From 6024877690099d34470937232ce960ecf48a514a Mon Sep 17 00:00:00 2001
From: anuveyatsu
Date: Sun, 22 Mar 2020 22:27:58 +0600
Subject: [PATCH 1/4] [metadata][xs]: add 'name' and 'title' properties to datapackage.json.

---
 datapackage.json | 6 ++++--
 process.py       | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/datapackage.json b/datapackage.json
index f57d0f4..54e255c 100644
--- a/datapackage.json
+++ b/datapackage.json
@@ -1,7 +1,8 @@
 {
   "bytes": 1465368,
   "count_of_rows": 28920,
-  "hash": "0645dcfca5d10e757252920dbcdbe67d",
+  "hash": "ac80ca047703880c26c97d8a6dc73df8",
+  "name": "covid-19",
   "profile": "data-package",
   "resources": [
     {
@@ -79,5 +80,6 @@
         ]
       }
     }
-  ]
+  ],
+  "title": "Novel Coronavirus 2019"
 }
\ No newline at end of file
diff --git a/process.py b/process.py
index 546b77e..0177661 100644
--- a/process.py
+++ b/process.py
@@ -1,4 +1,4 @@
-from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_resource, join, add_computed_field, delete_fields
+from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, add_computed_field, delete_fields
 
 BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
 CONFIRMED = 'time_series_19-covid-Confirmed.csv'
@@ -57,5 +57,6 @@ Flow(
         with_='{Case}'
     ),
     delete_fields(['Case']),
     update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'),
+    update_package(name='covid-19', title='Novel Coronavirus 2019'),
     dump_to_path()
 ).results()[0]

From 8eb6078b2ea4a8bf611f79149f93e059c62b9aec Mon Sep 17 00:00:00 2001
From: anuveyatsu
Date: Mon, 23 Mar 2020 00:17:41 +0600
Subject: [PATCH 2/4] [checkpoints][xs]: save data so it's easy to develop new flows.

---
 .gitignore | 3 +++
 process.py | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4d34fa8..3991ac8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,6 @@ static
 
 # put you virtual env names here
 env36/
+
+# dataflows checkpoints
+.checkpoints/
diff --git a/process.py b/process.py
index 0177661..08661f6 100644
--- a/process.py
+++ b/process.py
@@ -1,4 +1,4 @@
-from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, add_computed_field, delete_fields
+from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, add_computed_field, delete_fields, checkpoint
 
 BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
 CONFIRMED = 'time_series_19-covid-Confirmed.csv'
@@ -23,6 +23,7 @@ Flow(
     load(f'{BASE_URL}{CONFIRMED}'),
     load(f'{BASE_URL}{RECOVERED}'),
     load(f'{BASE_URL}{DEATH}'),
+    checkpoint('load_data'),
     unpivot(unpivoting_fields, extra_keys, extra_value),
     find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
     to_normal_date,

From 6389e94d6c60b4ae83b70a61801a6174e66e435e Mon Sep 17 00:00:00 2001
From: anuveyatsu
Date: Mon, 23 Mar 2020 00:19:16 +0600
Subject: [PATCH 3/4] [data][xs]: move the data to `/data/` dir.

---
 .../time-series-19-covid-combined.csv | 0
 datapackage.json                      | 2 +-
 process.py                            | 4 ++--
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename time-series-19-covid-combined.csv => data/time-series-19-covid-combined.csv (100%)

diff --git a/time-series-19-covid-combined.csv b/data/time-series-19-covid-combined.csv
similarity index 100%
rename from time-series-19-covid-combined.csv
rename to data/time-series-19-covid-combined.csv
diff --git a/datapackage.json b/datapackage.json
index 54e255c..3abd3cb 100644
--- a/datapackage.json
+++ b/datapackage.json
@@ -20,7 +20,7 @@
       "format": "csv",
       "hash": "7b65c350a6e621e770bdce075df8b657",
       "name": "time-series-19-covid-combined",
-      "path": "time-series-19-covid-combined.csv",
+      "path": "data/time-series-19-covid-combined.csv",
       "profile": "tabular-data-resource",
       "schema": {
         "fields": [
diff --git a/process.py b/process.py
index 08661f6..e8bcd9c 100644
--- a/process.py
+++ b/process.py
@@ -1,4 +1,4 @@
-from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, add_computed_field, delete_fields, checkpoint
+from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate
 
 BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
 CONFIRMED = 'time_series_19-covid-Confirmed.csv'
@@ -57,7 +57,7 @@ Flow(
         with_='{Case}'
     ),
     delete_fields(['Case']),
-    update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'),
+    update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='data/time-series-19-covid-combined.csv'),
     update_package(name='covid-19', title='Novel Coronavirus 2019'),
     dump_to_path()
 ).results()[0]

From 6b68769bf81825d057973318622a68dfb41a5476 Mon Sep 17 00:00:00 2001
From: anuveyatsu
Date: Mon, 23 Mar 2020 00:21:17 +0600
Subject: [PATCH 4/4] [data][xl]: add worldwide aggregated data by date.

---
 data/worldwide-aggregated.csv | 61 +++++++++++++++++++++++++++++++++++
 datapackage.json              | 49 ++++++++++++++++++++++++++++++++++++++++++++++++---
 process.py                    | 29 +++++++++++++++++
 3 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100644 data/worldwide-aggregated.csv

diff --git a/data/worldwide-aggregated.csv b/data/worldwide-aggregated.csv
new file mode 100644
index 0000000..50c80af
--- /dev/null
+++ b/data/worldwide-aggregated.csv
@@ -0,0 +1,61 @@
+Confirmed,Date,Deaths,Recovered
+555,2020-01-22,17,28
+653,2020-01-23,18,30
+941,2020-01-24,26,36
+1434,2020-01-25,42,39
+2118,2020-01-26,56,52
+2927,2020-01-27,82,61
+5578,2020-01-28,131,107
+6166,2020-01-29,133,126
+8234,2020-01-30,171,143
+9927,2020-01-31,213,222
+12038,2020-02-01,259,284
+16787,2020-02-02,362,472
+19881,2020-02-03,426,623
+23892,2020-02-04,492,852
+27635,2020-02-05,564,1124
+30817,2020-02-06,634,1487
+34391,2020-02-07,719,2011
+37120,2020-02-08,806,2616
+40150,2020-02-09,906,3244
+42762,2020-02-10,1013,3946
+44802,2020-02-11,1113,4683
+45221,2020-02-12,1118,5150
+60368,2020-02-13,1371,6295
+66885,2020-02-14,1523,8058
+69030,2020-02-15,1666,9395
+71224,2020-02-16,1770,10865
+73258,2020-02-17,1868,12583
+75136,2020-02-18,2007,14352
+75639,2020-02-19,2122,16121
+76197,2020-02-20,2247,18177
+76823,2020-02-21,2251,18890
+78579,2020-02-22,2458,22886
+78965,2020-02-23,2469,23394
+79568,2020-02-24,2629,25227
+80413,2020-02-25,2708,27905
+81395,2020-02-26,2770,30384
+82754,2020-02-27,2814,33277
+84120,2020-02-28,2872,36711
+86011,2020-02-29,2941,39782
+88369,2020-03-01,2996,42716
+90306,2020-03-02,3085,45602
+92840,2020-03-03,3160,48228
+95120,2020-03-04,3254,51170
+97882,2020-03-05,3348,53796
+101784,2020-03-06,3460,55865
+105821,2020-03-07,3558,58358
+109795,2020-03-08,3802,60694
+113561,2020-03-09,3988,62494
+118592,2020-03-10,4262,64404
+125865,2020-03-11,4615,67003
+128343,2020-03-12,4720,68324
+145193,2020-03-13,5404,70251
+156094,2020-03-14,5819,72624
+167446,2020-03-15,6440,76034
+181527,2020-03-16,7126,78088
+197142,2020-03-17,7905,80840
+214910,2020-03-18,8733,83207
+242708,2020-03-19,9867,84854
+272166,2020-03-20,11299,87256
+304524,2020-03-21,12973,91499
diff --git a/datapackage.json b/datapackage.json
index 3abd3cb..0b434f7 100644
--- a/datapackage.json
+++ b/datapackage.json
@@ -1,7 +1,7 @@
 {
-  "bytes": 1465368,
-  "count_of_rows": 28920,
-  "hash": "ac80ca047703880c26c97d8a6dc73df8",
+  "bytes": 1467076,
+  "count_of_rows": 28980,
+  "hash": "6e25a726932e6dabaaa8b8e5ed578dd7",
   "name": "covid-19",
   "profile": "data-package",
   "resources": [
@@ -79,6 +79,49 @@
           ""
         ]
       }
+    },
+    {
+      "bytes": 1708,
+      "dialect": {
+        "delimiter": ",",
+        "doubleQuote": true,
+        "lineTerminator": "\r\n",
+        "quoteChar": "\"",
+        "skipInitialSpace": false
+      },
+      "encoding": "utf-8",
+      "format": "csv",
+      "hash": "f261b5b527542ceace4f2c7941d69e40",
+      "name": "worldwide-aggregated",
+      "path": "data/worldwide-aggregated.csv",
+      "profile": "data-resource",
+      "schema": {
+        "fields": [
+          {
+            "decimalChar": ".",
+            "groupChar": "",
+            "name": "Confirmed",
+            "type": "number"
+          },
+          {
+            "format": "%Y-%m-%d",
+            "name": "Date",
+            "type": "date"
+          },
+          {
+            "decimalChar": ".",
+            "groupChar": "",
+            "name": "Deaths",
+            "type": "number"
+          },
+          {
+            "decimalChar": ".",
+            "groupChar": "",
+            "name": "Recovered",
+            "type": "number"
+          }
+        ]
+      }
     }
   ],
   "title": "Novel Coronavirus 2019"
diff --git a/process.py b/process.py
index e8bcd9c..e1d4b1b 100644
--- a/process.py
+++ b/process.py
@@ -59,5 +59,34 @@ Flow(
     delete_fields(['Case']),
     update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='data/time-series-19-covid-combined.csv'),
     update_package(name='covid-19', title='Novel Coronavirus 2019'),
+    dump_to_path(),
+    checkpoint('processed_data'),
+    # Duplicate the stream to create aggregated data
+    duplicate(
+        source='time-series-19-covid-combined',
+        target_name='worldwide-aggregated',
+        target_path='data/worldwide-aggregated.csv'
+    ),
+    join_with_self(
+        resource_name='worldwide-aggregated',
+        join_key=['Date'],
+        fields=dict(
+            Date={
+                'name': 'Date'
+            },
+            Confirmed={
+                'name': 'Confirmed',
+                'aggregate': 'sum'
+            },
+            Recovered={
+                'name': 'Recovered',
+                'aggregate': 'sum'
+            },
+            Deaths={
+                'name': 'Deaths',
+                'aggregate': 'sum'
+            }
+        )
+    ),
     dump_to_path()
 ).results()[0]
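
Note on the aggregation step in PATCH 4/4: duplicate() copies the per-country
combined resource, and join_with_self() then collapses the copy to one row per
Date, summing the three case columns across countries. The sketch below shows
that behaviour in isolation. It is not part of the patches: the sample rows are
made up, and 'res_1' is assumed to be dataflows' default name for an inline
iterable resource.

    # Minimal, self-contained sketch of the join_with_self aggregation.
    from dataflows import Flow, join_with_self

    # Hypothetical sample of the per-country combined data (illustrative only).
    SAMPLE = [
        {'Date': '2020-01-22', 'Country/Region': 'Mainland China', 'Confirmed': 548, 'Recovered': 28, 'Deaths': 17},
        {'Date': '2020-01-22', 'Country/Region': 'Japan', 'Confirmed': 2, 'Recovered': 0, 'Deaths': 0},
        {'Date': '2020-01-23', 'Country/Region': 'Mainland China', 'Confirmed': 643, 'Recovered': 30, 'Deaths': 18},
    ]

    results = Flow(
        SAMPLE,
        join_with_self(
            # 'res_1' is assumed to be the default name dataflows gives an
            # anonymous inline resource; the patch uses 'worldwide-aggregated'.
            resource_name='res_1',
            join_key=['Date'],
            fields=dict(
                Date={'name': 'Date'},
                Confirmed={'name': 'Confirmed', 'aggregate': 'sum'},
                Recovered={'name': 'Recovered', 'aggregate': 'sum'},
                Deaths={'name': 'Deaths', 'aggregate': 'sum'},
            ),
        ),
    ).results()[0]

    # One row per date, with country-level figures summed:
    # [{'Date': '2020-01-22', 'Confirmed': 550, 'Recovered': 28, 'Deaths': 17},
    #  {'Date': '2020-01-23', 'Confirmed': 643, 'Recovered': 30, 'Deaths': 18}]
    print(results[0])

Any field not listed in fields (here Country/Region) is dropped by the
aggregation, which is why the committed data/worldwide-aggregated.csv carries
only the Confirmed, Date, Deaths and Recovered columns.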