commit a217b0ee17e1618e822c963762aed92372e9f88c Author: Stefano Pigozzi Date: Fri Mar 12 12:07:11 2021 +0100 ✨ First commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f7275bb --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +venv/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/discord.xml b/.idea/discord.xml new file mode 100644 index 0000000..cd711a0 --- /dev/null +++ b/.idea/discord.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..479a247 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,55 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d55e259 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,9 @@ + + + + IDE + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..47b7880 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/pandasdmx.ipynb b/pandasdmx.ipynb new file mode 100644 index 0000000..ea5fdf4 --- /dev/null +++ b/pandasdmx.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# PandaSDMX\n", + "\n", + "- [Documentazione aggiornata (v1.4.1)](https://pandasdmx.readthedocs.io/en/latest/)\n", + "- [Esempio approfondito (ma non troppo 
aggiornato)](https://pandasdmx.readthedocs.io/en/latest/walkthrough.html#sdmx-workflow)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Installazione\n", + "\n", + "- L'ultima versione non funziona con Pydantic 1.8.1 ma richiede 1.7 ([dr-leo/pandaSDMX#204](https://github.com/dr-leo/pandaSDMX/issues/204))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandasdmx in ./venv/lib/python3.9/site-packages (1.4.1)\r\n", + "Requirement already satisfied: pydantic==1.7 in ./venv/lib/python3.9/site-packages (1.7)\r\n", + "Requirement already satisfied: requests>=2.7 in ./venv/lib/python3.9/site-packages (from pandasdmx) (2.25.1)\r\n", + "Requirement already satisfied: lxml>=3.6 in ./venv/lib/python3.9/site-packages (from pandasdmx) (4.6.2)\r\n", + "Requirement already satisfied: pandas>=1.0 in ./venv/lib/python3.9/site-packages (from pandasdmx) (1.2.3)\r\n", + "Requirement already satisfied: pytz>=2017.3 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) (2021.1)\r\n", + "Requirement already satisfied: numpy>=1.16.5 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) (1.20.1)\r\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in ./venv/lib/python3.9/site-packages (from pandas>=1.0->pandasdmx) (2.8.1)\r\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=1.0->pandasdmx) (1.15.0)\r\n", + "Requirement already satisfied: idna<3,>=2.5 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (2.10)\r\n", + "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (2020.12.5)\r\n", + "Requirement 
already satisfied: urllib3<1.27,>=1.21.1 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (1.26.3)\r\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in ./venv/lib/python3.9/site-packages (from requests>=2.7->pandasdmx) (4.0.0)\r\n" + ] + } + ], + "source": [ + "!pip install pandasdmx pydantic==1.7" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Esempio" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/tera/ext4/code/sdmx-sandbox/venv/lib/python3.9/site-packages/pandasdmx/remote.py:11: RuntimeWarning: optional dependency requests_cache is not installed; cache options to Session() have no effect\n", + " warn(\n" + ] + } + ], + "source": [ + "import pandas\n", + "import pandasdmx\n", + "\n", + "# Per type annotations\n", + "import pandasdmx.message\n", + "import pandasdmx.model\n", + "import pandasdmx.source" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "È possibile selezionare tra più fonti di dati, tra i quali Eurostat:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eurostat: pandasdmx.Request = pandasdmx.Request(\"ESTAT\")\n", + "eurostat" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Sembra che PandaSDMX implementi la funzionalità che cercavamo di ricerca metadati:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% 
md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "\n
\n id: 'IDREF351597'\n prepared: '2021-03-12T10:31:59.307000+00:00'\n receiver: \n sender: \n source: \n test: False\n response: \n DataflowDefinition (6573): DS-018995 DS-022469 DS-032655 DS-043227 DS...\n DataStructureDefinition (6573): DSD_DS-018995 DSD_DS-022469 DSD_DS-03..." + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Scarica i metadati di TUTTI i dataflow disponibili su Eurostat\n", + "# Ci mette qualche minuto: i dataflow sono 6573!\n", + "flow_msg: pandasdmx.message.Message = eurostat.dataflow()\n", + "flow_msg" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "# Convertiamo i risultati in due Series di pandas, una con i dataflow e una con la loro relativa struttura\n", + "_dict: dict[str, pandas.Series] = flow_msg.to_pandas()\n", + "dataflows = _dict[\"dataflow\"]\n", + "structure = _dict[\"structure\"]\n", + "dataflows, structure" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": 13, + "outputs": [ + { + "data": { + "text/plain": "(DS-018995 EU trade since 1988 by SITC\n DS-022469 EXTRA EU trade since 1999 by mode of transport...\n DS-032655 EU trade since 1988 by BEC\n DS-043227 EFTA trade since 1995 by SITC\n DS-066341 Sold production, exports and imports by PRODCO...\n ... 
\n yth_incl_120 Young people living in households with very lo...\n yth_part_010 Frequency of getting together with relatives o...\n yth_part_020 Frequency of contacts with relatives or friend...\n yth_part_030 Participation of young people in activities of...\n yth_volunt_010 Participation of young people in informal volu...\n Length: 6573, dtype: object,\n DSD_DS-018995 \n DSD_DS-022469 \n DSD_DS-032655 \n DSD_DS-043227 \n DSD_DS-066341 \n ..\n DSD_yth_incl_120 \n DSD_yth_part_010 \n DSD_yth_part_020 \n DSD_yth_part_030 \n DSD_yth_volunt_010 \n Length: 6573, dtype: object)" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "data": { + "text/plain": "educ_enrl1ad Students by ISCED level, study intensity and sex\neduc_enrl1at Students by ISCED level, type of institution a...\neduc_enrl1tl Students by ISCED level, age and sex\neduc_enrl5 Tertiary students (ISCED 5-6) by field of educ...\neduc_enrl6 Tertiary students (ISCED 5-6) non-citizens, n...\neduc_enrl8 Tertiary students (ISCED 5-6) by country of ci...\neduc_enrllng1 Students in ISCED 1-3 by modern foreign langua...\neduc_enrllng2 Students in ISCED 1-3 by number of modern fore...\neduc_fiaid Financial aid to students\neduc_ilev Distribution of pupils/ students by level\neduc_iste Pupil/ student - teacher ratio and average cla...\neduc_mofo_dst Foreign students by level of education and cou...\neduc_mofo_fld Foreign students by level and field of education\neduc_mofo_gen Foreign students by level of education and sex\neduc_mofo_orig Foreign students by level of education and cou...\neduc_momo_dst Students going abroad by level of education an...\neduc_momo_fld Students from abroad by level and field of edu...\neduc_momo_gen Students from abroad by level of education and...\neduc_momo_orig Students from abroad by level of education and...\neduc_outc_pisa Underachieving 15-year-old students by sex 
and...\neduc_renrlrg1 Students by level of education, orientation, s...\neduc_renrlrg3 Students by age, sex and NUTS 2 regions\neduc_thmob Student mobility\neduc_uoe_enra01 Pupils and students enrolled by education leve...\neduc_uoe_enra02 Pupils and students enrolled by education leve...\neduc_uoe_enra03 Pupils and students enrolled by education leve...\neduc_uoe_enra04 Pupils and students by education level - as % ...\neduc_uoe_enra05 Pupils and students in education by age groups...\neduc_uoe_enra06 Pupils and students in education aged 30 and o...\neduc_uoe_enra07 Expected school years of pupils and students b...\neduc_uoe_enra08 Students in post-compulsory education - as % o...\neduc_uoe_enra09 Students participation at the end of compulsor...\neduc_uoe_enra11 Pupils and students enrolled by education leve...\neduc_uoe_enra12 Pupils and students enrolled by sex, age and N...\neduc_uoe_enra13 Distribution of pupils and students enrolled i...\neduc_uoe_enra16 Pupils and students enrolled by education leve...\neduc_uoe_enrt01 Students enrolled in tertiary education by edu...\neduc_uoe_enrt02 Students enrolled in tertiary education by edu...\neduc_uoe_enrt03 Students enrolled in tertiary education by edu...\neduc_uoe_enrt04 Distribution of students enrolled at tertiary ...\neduc_uoe_enrt05 Ratio of the proportion of tertiary students o...\neduc_uoe_enrt06 Students enrolled in tertiary education by edu...\neduc_uoe_enrt07 Students in tertiary education by age groups -...\neduc_uoe_enrt08 Students in tertiary education - as % of 20-24...\neduc_uoe_fina01 Financial aid to students by education level -...\neduc_uoe_fine09 Public expenditure on education per pupil/stud...\neduc_uoe_fine10 Pupils and students enrolled by education leve...\neduc_uoe_fini04 Annual expenditure on educational institutions...\neduc_uoe_fini06 Ratio of annual expenditure per student at the...\neduc_uoe_mobs01 Mobile students from abroad enrolled by educat...\neduc_uoe_mobs02 Mobile students 
from abroad enrolled by educat...\neduc_uoe_mobs03 Share of mobile students from abroad enrolled ...\neduc_uoe_mobs04 Distribution of mobile students from abroad en...\neduc_uoe_perp04 Ratio of pupils and students to teachers and a...\nhrst_fl_tefor Participation of foreign students in tertiary ...\ntsc00028 Doctorate students in science and technology f...\ndtype: object" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Cerchiamo nella Series i dataflows la cui descrizione contiene \"student\"\n", + "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html\n", + "student_dataflows = dataflows[dataflows.str.contains(\"student\", case=False)]\n", + "student_dataflows" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 33, + "outputs": [ + { + "data": { + "text/plain": "(,\n )" + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Prendiamo il primo e andiamo a scaricare i dati corrispondenti\n", + "dataflow_label = student_dataflows.index[0]\n", + "data_msg: pandasdmx.message.Message = eurostat.dataflow(dataflow_label)\n", + "dataflow: pandasdmx.model.DataflowDefinition = data_msg.dataflow[dataflow_label]\n", + "structure: pandasdmx.source.DataStructureDefinition = dataflow.structure\n", + "dataflow, structure\n", + "\n", + "# Si possono usare i dati della struttura richiesti in precedenza?" 
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 36, + "outputs": [ + { + "data": { + "text/plain": "([], [], [], [])" + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Analizziamo la struttura dei dati\n", + "# È composta da:\n", + "# - dimensioni\n", + "# - attributi\n", + "# - misure\n", + "# Cosa cambia tra uno e l'altro?\n", + "\n", + "structure.dimensions.components, structure.attributes.components, structure.measures.components, structure.annotations" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/sdmx-sandbox.iml b/sdmx-sandbox.iml new file mode 100644 index 0000000..456e24f --- /dev/null +++ b/sdmx-sandbox.iml @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file