From 3f8dfe6f5fa91a790d063a0a16840ef292d27eda Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Sat, 20 Mar 2021 23:58:26 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Find=20out=20how=20to=20filter=20by?= =?UTF-8?q?=20date?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandasdmx.ipynb | 473 ++++++++++++++++++++++++++++-------------------- 1 file changed, 274 insertions(+), 199 deletions(-) diff --git a/pandasdmx.ipynb b/pandasdmx.ipynb index ff493cf..2086233 100644 --- a/pandasdmx.ipynb +++ b/pandasdmx.ipynb @@ -10,10 +10,7 @@ "- [Esempio approfondito (ma non troppo aggiornato)](https://pandasdmx.readthedocs.io/en/latest/walkthrough.html)" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { @@ -24,36 +21,13 @@ "- L'ultima versione non funziona con Pydantic 1.8.1 ma richiede 1.7 ([dr-leo/pandaSDMX#204](https://github.com/dr-leo/pandaSDMX/issues/204))" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 1, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: pandasdmx in ./venv/lib/python3.9/site-packages (1.4.1)\r\n", - "Requirement already satisfied: pydantic==1.7 in ./venv/lib/python3.9/site-packages (1.7)\r\n", - "Requirement already satisfied: requests>=2.7 in ./venv/lib/python3.9/site-packages (from pandasdmx) (2.25.1)\r\n", - "Requirement already satisfied: pandas>=1.0 in ./venv/lib/python3.9/site-packages (from pandasdmx) (1.2.3)\r\n", - "Requirement already satisfied: lxml>=3.6 in ./venv/lib/python3.9/site-packages (from pandasdmx) (4.6.2)\r\n", - "Requirement already satisfied: pytz>=2017.3 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) (2021.1)\r\n", - "Requirement already satisfied: numpy>=1.16.5 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) 
(1.20.1)\r\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) (2.8.1)\r\n", - "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=1.0->pandasdmx) (1.15.0)\r\n", - "Requirement already satisfied: chardet<5,>=3.0.2 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (4.0.0)\r\n", - "Requirement already satisfied: idna<3,>=2.5 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (2.10)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (2020.12.5)\r\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (1.26.3)\r\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ "!pip install pandasdmx pydantic==1.7" ], @@ -70,25 +44,13 @@ "## Esempio" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 2, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/mnt/tera/ext4/code/sdmx-sandbox/venv/lib/python3.9/site-packages/pandasdmx/remote.py:11: RuntimeWarning: optional dependency requests_cache is not installed; cache options to Session() have no effect\n", - " warn(\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ "import pandas\n", "import pandasdmx\n", @@ -112,10 +74,7 @@ "È possibile selezionare tra più fonti di dati, tra i quali Eurostat (`ESTAT`)." 
], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { @@ -126,25 +85,13 @@ "Come prima cosa, è necessario creare un'istanza di `pandasdmx.Request`:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 3, - "outputs": [ - { - "data": { - "text/plain": "" - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "eurostat: pandasdmx.Request = pandasdmx.Request(\"ESTAT\")\n", "eurostat" @@ -166,25 +113,13 @@ "Poi, scarichiamo _tutti_ i dataflow disponibili usando `.dataflow()` sul client creato in precedenza per effettuare una richiesta al server Eurostat, creando un `pandasdmx.message.Message`:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 4, - "outputs": [ - { - "data": { - "text/plain": "\n
\n id: 'IDREF382067'\n prepared: '2021-03-15T01:45:49.005000+00:00'\n receiver: \n sender: \n source: \n test: False\n response: \n DataflowDefinition (6573): DS-018995 DS-022469 DS-032655 DS-043227 DS...\n DataStructureDefinition (6573): DSD_DS-018995 DSD_DS-022469 DSD_DS-03..." - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "all_flows_msg: pandasdmx.message.Message = eurostat.dataflow()\n", "all_flows_msg" @@ -211,6 +146,8 @@ }, { "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "# Converte i risultati in due Series di pandas, una con i dataflow e una con la loro relativa struttura\n", "_dict: dict[str, pandas.Series] = all_flows_msg.to_pandas()\n", @@ -223,32 +160,12 @@ "pycharm": { "name": "#%%\n" } - }, - "execution_count": 5, - "outputs": [ - { - "data": { - "text/plain": "(DS-018995 EU trade since 1988 by SITC\n DS-022469 EXTRA EU trade since 1999 by mode of transport...\n DS-032655 EU trade since 1988 by BEC\n DS-043227 EFTA trade since 1995 by SITC\n DS-066341 Sold production, exports and imports by PRODCO...\n ... 
\n yth_incl_120 Young people living in households with very lo...\n yth_part_010 Frequency of getting together with relatives o...\n yth_part_020 Frequency of contacts with relatives or friend...\n yth_part_030 Participation of young people in activities of...\n yth_volunt_010 Participation of young people in informal volu...\n Length: 6573, dtype: object,\n DSD_DS-018995 \n DSD_DS-022469 \n DSD_DS-032655 \n DSD_DS-043227 \n DSD_DS-066341 \n ..\n DSD_yth_incl_120 \n DSD_yth_part_010 \n DSD_yth_part_020 \n DSD_yth_part_030 \n DSD_yth_volunt_010 \n Length: 6573, dtype: object)" - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ] + } }, { "cell_type": "code", - "execution_count": 6, - "outputs": [ - { - "data": { - "text/plain": "educ_enrl1ad Students by ISCED level, study intensity and sex\neduc_enrl1at Students by ISCED level, type of institution a...\neduc_enrl1tl Students by ISCED level, age and sex\neduc_enrl5 Tertiary students (ISCED 5-6) by field of educ...\neduc_enrl6 Tertiary students (ISCED 5-6) non-citizens, n...\neduc_enrl8 Tertiary students (ISCED 5-6) by country of ci...\neduc_enrllng1 Students in ISCED 1-3 by modern foreign langua...\neduc_enrllng2 Students in ISCED 1-3 by number of modern fore...\neduc_fiaid Financial aid to students\neduc_ilev Distribution of pupils/ students by level\neduc_iste Pupil/ student - teacher ratio and average cla...\neduc_mofo_dst Foreign students by level of education and cou...\neduc_mofo_fld Foreign students by level and field of education\neduc_mofo_gen Foreign students by level of education and sex\neduc_mofo_orig Foreign students by level of education and cou...\neduc_momo_dst Students going abroad by level of education an...\neduc_momo_fld Students from abroad by level and field of edu...\neduc_momo_gen Students from abroad by level of education and...\neduc_momo_orig Students from abroad by level of education and...\neduc_outc_pisa Underachieving 15-year-old students by sex 
and...\neduc_renrlrg1 Students by level of education, orientation, s...\neduc_renrlrg3 Students by age, sex and NUTS 2 regions\neduc_thmob Student mobility\neduc_uoe_enra01 Pupils and students enrolled by education leve...\neduc_uoe_enra02 Pupils and students enrolled by education leve...\neduc_uoe_enra03 Pupils and students enrolled by education leve...\neduc_uoe_enra04 Pupils and students by education level - as % ...\neduc_uoe_enra05 Pupils and students in education by age groups...\neduc_uoe_enra06 Pupils and students in education aged 30 and o...\neduc_uoe_enra07 Expected school years of pupils and students b...\neduc_uoe_enra08 Students in post-compulsory education - as % o...\neduc_uoe_enra09 Students participation at the end of compulsor...\neduc_uoe_enra11 Pupils and students enrolled by education leve...\neduc_uoe_enra12 Pupils and students enrolled by sex, age and N...\neduc_uoe_enra13 Distribution of pupils and students enrolled i...\neduc_uoe_enra16 Pupils and students enrolled by education leve...\neduc_uoe_enrt01 Students enrolled in tertiary education by edu...\neduc_uoe_enrt02 Students enrolled in tertiary education by edu...\neduc_uoe_enrt03 Students enrolled in tertiary education by edu...\neduc_uoe_enrt04 Distribution of students enrolled at tertiary ...\neduc_uoe_enrt05 Ratio of the proportion of tertiary students o...\neduc_uoe_enrt06 Students enrolled in tertiary education by edu...\neduc_uoe_enrt07 Students in tertiary education by age groups -...\neduc_uoe_enrt08 Students in tertiary education - as % of 20-24...\neduc_uoe_fina01 Financial aid to students by education level -...\neduc_uoe_fine09 Public expenditure on education per pupil/stud...\neduc_uoe_fine10 Pupils and students enrolled by education leve...\neduc_uoe_fini04 Annual expenditure on educational institutions...\neduc_uoe_fini06 Ratio of annual expenditure per student at the...\neduc_uoe_mobs01 Mobile students from abroad enrolled by educat...\neduc_uoe_mobs02 Mobile students 
from abroad enrolled by educat...\neduc_uoe_mobs03 Share of mobile students from abroad enrolled ...\neduc_uoe_mobs04 Distribution of mobile students from abroad en...\neduc_uoe_perp04 Ratio of pupils and students to teachers and a...\nhrst_fl_tefor Participation of foreign students in tertiary ...\ntsc00028 Doctorate students in science and technology f...\ndtype: object" - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "# Cerchiamo nella Series i allflows la cui descrizione contiene \"student\"\n", "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html\n", @@ -268,25 +185,13 @@ "Per continuare gli esperimenti, prendiamo il primo dataflow tra quelli contenenti `\"student\"` nel label:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 7, - "outputs": [ - { - "data": { - "text/plain": "'educ_enrl1ad'" - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "my_flow_label = student_flows.index[0]\n", "my_flow_label" @@ -304,25 +209,13 @@ "Usiamo il label per chiamare di nuovo `.dataflow()`, specificando però stavolta il dataflow di cui ci interessano i dettagli:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 8, - "outputs": [ - { - "data": { - "text/plain": "" - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "my_flow_msg: pandasdmx.message.Message = eurostat.dataflow(my_flow_label)\n", "my_flow: pandasdmx.model.DataflowDefinition = my_flow_msg.dataflow[my_flow_label]\n", @@ -345,25 +238,13 @@ "Scopriamo prima il label della structure, poi scarichiamo 
da Eurostat la structure del dataflow che ci interessa con il metodo `.datastructure()`:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 9, - "outputs": [ - { - "data": { - "text/plain": "" - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "my_struct_label: pandasdmx.source.DataStructureDefinition = my_flow.structure.id\n", "my_struct_msg: pandasdmx.message.Message = eurostat.datastructure(my_struct_label)\n", @@ -391,25 +272,13 @@ "> __Annotations__: commenti che possono essere aggiunti al dataflow" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 10, - "outputs": [ - { - "data": { - "text/plain": "([],\n >,\n ; >,\n ; ; ; ; ; ; >)" - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "my_struct.annotations, my_struct.measures, my_struct.attributes, my_struct.dimensions" ], @@ -426,28 +295,42 @@ "Infine, richiediamo i dati da Eurostat, limitandoli a quelli dell'`IT`alia dal 2010 in poi e selezionando solo il `WORKTIME` `TOTAL`, e convertiamoli in una Series multi-chiave:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 28, - "outputs": [ - { - "data": { - "text/plain": "FREQ UNIT ISCED97 SEX WORKTIME GEO TIME_PERIOD\nA NR ED0 F TOTAL IT 2010 808706.0\n 2011 811615.0\n 2012 815656.0\n M TOTAL IT 2010 872281.0\n 2011 876225.0\n ... 
\n UNK M TOTAL IT 2011 NaN\n 2012 NaN\n T TOTAL IT 2010 NaN\n 2011 NaN\n 2012 NaN\nName: value, Length: 279, dtype: float64" - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ "my_data_msg: pandasdmx.message.Message = eurostat.data(my_flow_label, key={\"GEO\": \"IT\", \"WORKTIME\": \"TOTAL\"}, params={\"startPeriod\": \"2010\"})\n", - "my_data: pandas.Series = my_data_msg.to_pandas()\n", + "my_data_series: pandas.Series = my_data_msg.to_pandas()\n", + "my_data_series" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "> __DataFrame__: Tabella di dati di `pandas`, implementata come array di Series\n", + "\n", + "Per avere una rappresentazione migliore dei dati sul notebook, convertiamo la Series a un DataFrame:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "my_data: pandas.DataFrame = my_data_series.to_frame()\n", "my_data" ], "metadata": { @@ -460,31 +343,43 @@ { "cell_type": "markdown", "source": [ - "Abbiamo ricevuto i dati, e possiamo manipolarli come una qualsiasi series di `pandas` (le quali sono molto simili a tabelle SQL in-memory):" + "Inoltre, per semplificarne le query, \"appiattiamo\" il [MultiIndex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html) trasformandolo in normalissime colonne:" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { "cell_type": "code", - "execution_count": 30, - "outputs": [ - { - "data": { - "text/plain": "SEX TIME_PERIOD\nF 2010 808706.0\n 2011 811615.0\n 2012 815656.0\nM 2010 872281.0\n 2011 876225.0\n 2012 879256.0\nT 2010 1680987.0\n 2011 1687840.0\n 2012 1694912.0\nName: value, dtype: float64" - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } 
+ "execution_count": null, + "outputs": [], + "source": [ + "my_data.reset_index(inplace=True)\n", + "my_data" ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Abbiamo finalmente i dati, e possiamo manipolarli come un qualsiasi DataFrame di `pandas`, in modo molto simile a una tabella SQL:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "# Il numero di studenti [M]aschi, [F]emmine e [T]otali in Italia nel [2010], [2011] e [2012]\n", - "my_data.groupby([\"SEX\", \"TIME_PERIOD\"]).first()" + "my_data.groupby([\"FREQ\", \"TIME_PERIOD\", \"SEX\"]).first()" ], "metadata": { "collapsed": false, @@ -508,10 +403,7 @@ "- `OECD` - Organisation for Economic Cooperation and Development" ], "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } + "collapsed": false } }, { @@ -523,10 +415,193 @@ "\n", "[SQLAlchemy](https://www.sqlalchemy.org/) potrebbe essere utile in questo caso; non sono particolarmente familiare con l'[ORM di Django](https://docs.djangoproject.com/en/3.1/topics/db/models/), ma sembrano molto simili (anche se [si direbbe che SQLAlchemy supporti query più complesse](https://stackoverflow.com/questions/18199053/example-of-what-sqlalchemy-can-do-and-django-orm-cannot))." 
], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Filtraggio in base a `TIME_PERIOD`\n", + "\n", + "È possibile capire se un DataFrame ha una colonna `TIME_PERIOD` in questo modo:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "\"TIME_PERIOD\" in list(my_data.columns)" + ], "metadata": { "collapsed": false, "pycharm": { - "name": "#%% md\n" + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "I `TIME_PERIOD` possono essere misurati in modi diversi: anni, trimestri, giorni, etc...\n", + "\n", + "I valori possibili sono:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "list(my_struct.dimensions.get(\"FREQ\").local_representation.enumerated)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Per capire quali sono disponibili, si può effettuare una query aggregata:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "list(my_data.groupby([\"FREQ\"]).any().index)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "In questo caso, è disponibile solo `A`, il che significa che le misurazioni sono **eseguite solo annualmente**.\n", + "\n", + "Possiamo trovare il \"periodo\" più recente con una query sulla tabella:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "latest_period = my_data[\"TIME_PERIOD\"].max()\n", + "latest_period" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Possiamo
filtrare i dati in modo da avere solo quelli del periodo desiderato:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "my_data.loc[my_data[\"TIME_PERIOD\"] == latest_period]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "In generale, possiamo applicare ulteriori filtri effettuando accessi agli elementi (`__getitem__`) della proprietà `loc` del dataframe:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "my_data.loc[my_data[\"SEX\"] == \"M\"]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 26, + "outputs": [ + { + "data": { + "text/plain": " FREQ UNIT ISCED97 SEX WORKTIME GEO TIME_PERIOD value\n5 A NR ED0 M TOTAL IT 2012 879256.0", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
FREQUNITISCED97SEXWORKTIMEGEOTIME_PERIODvalue
5ANRED0MTOTALIT2012879256.0
\n
" + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " my_data\n", + " .loc[my_data[\"TIME_PERIOD\"] == latest_period]\n", + " .loc[my_data[\"SEX\"] == \"M\"]\n", + " .loc[my_data[\"ISCED97\"] == \"ED0\"]\n", + ")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" } } }