diff --git a/data/key-countries-pivoted.csv b/data/key-countries-pivoted.csv new file mode 100644 index 0000000..7351e5e --- /dev/null +++ b/data/key-countries-pivoted.csv @@ -0,0 +1,63 @@ +Date,China,US,United_Kingdom,Italy,France,Germany,Spain,Iran +2020-01-22,548,1,0,0,0,0,0,0 +2020-01-23,643,1,0,0,0,0,0,0 +2020-01-24,920,2,0,0,2,0,0,0 +2020-01-25,1406,2,0,0,3,0,0,0 +2020-01-26,2075,5,0,0,3,0,0,0 +2020-01-27,2877,5,0,0,3,1,0,0 +2020-01-28,5509,5,0,0,4,4,0,0 +2020-01-29,6087,5,0,0,5,4,0,0 +2020-01-30,8141,5,0,0,5,4,0,0 +2020-01-31,9802,7,2,2,5,5,0,0 +2020-02-01,11891,8,2,2,6,8,1,0 +2020-02-02,16630,8,2,2,6,10,1,0 +2020-02-03,19716,11,2,2,6,12,1,0 +2020-02-04,23707,11,2,2,6,12,1,0 +2020-02-05,27440,11,2,2,6,12,1,0 +2020-02-06,30587,11,2,2,6,12,1,0 +2020-02-07,34110,11,3,3,6,13,1,0 +2020-02-08,36814,11,3,3,11,13,1,0 +2020-02-09,39829,11,3,3,11,14,2,0 +2020-02-10,42354,11,8,3,11,14,2,0 +2020-02-11,44386,12,8,3,11,16,2,0 +2020-02-12,44759,12,9,3,11,16,2,0 +2020-02-13,59895,13,9,3,11,16,2,0 +2020-02-14,66358,13,9,3,11,16,2,0 +2020-02-15,68413,13,9,3,12,16,2,0 +2020-02-16,70513,13,9,3,12,16,2,0 +2020-02-17,72434,13,9,3,12,16,2,0 +2020-02-18,74211,13,9,3,12,16,2,0 +2020-02-19,74619,13,9,3,12,16,2,2 +2020-02-20,75077,13,9,3,12,16,2,5 +2020-02-21,75550,15,9,20,12,16,2,18 +2020-02-22,77001,15,9,62,12,16,2,28 +2020-02-23,77022,15,9,155,12,16,2,43 +2020-02-24,77241,51,13,229,12,16,2,61 +2020-02-25,77754,51,13,322,14,17,6,95 +2020-02-26,78166,57,13,453,18,27,13,139 +2020-02-27,78600,58,15,655,38,46,15,245 +2020-02-28,78928,60,20,888,57,48,32,388 +2020-02-29,79356,68,23,1128,100,79,45,593 +2020-03-01,79932,74,36,1694,130,130,84,978 +2020-03-02,80136,98,40,2036,191,159,120,1501 +2020-03-03,80261,118,51,2502,204,196,165,2336 +2020-03-04,80386,149,86,3089,288,262,222,2922 +2020-03-05,80537,217,116,3858,380,482,259,3513 +2020-03-06,80690,262,164,4636,656,670,400,4747 +2020-03-07,80770,402,207,5883,959,799,500,5823 +2020-03-08,80823,518,274,7375,1136,1040,673,6566 
+2020-03-09,80860,583,322,9172,1219,1176,1073,7161 +2020-03-10,80887,959,384,10149,1794,1457,1695,8042 +2020-03-11,80921,1281,459,12462,2293,1908,2277,9000 +2020-03-12,80932,1663,459,12462,2293,2078,2277,10075 +2020-03-13,80945,2179,802,17660,3681,3675,5232,11364 +2020-03-14,80977,2727,1144,21157,4496,4585,6391,12729 +2020-03-15,81003,3499,1145,24747,4532,5795,7798,13938 +2020-03-16,81033,4632,1551,27980,6683,7272,9942,14991 +2020-03-17,81058,6421,1960,31506,7715,9257,11748,16169 +2020-03-18,81102,7783,2642,35713,9124,12327,13910,17361 +2020-03-19,81156,13677,2716,41035,10970,15320,17963,18407 +2020-03-20,81250,19100,4014,47021,12758,19848,20410,19644 +2020-03-21,81305,25489,5067,53578,14463,22213,25374,20610 +2020-03-22,81435,33276,5745,59138,16243,24873,28768,21638 +2020-03-23,81498,43847,6726,63927,20123,29056,35136,23049 diff --git a/datapackage.json b/datapackage.json index 0152f59..fe60bf3 100644 --- a/datapackage.json +++ b/datapackage.json @@ -1,7 +1,7 @@ { - "bytes": 976643, - "count_of_rows": 25234, - "hash": "9bc713d601a6c93496187964b2922be5", + "bytes": 979273, + "count_of_rows": 25296, + "hash": "6c8d26b210a9141fbcd24652a104fae1", "name": "covid-19", "profile": "data-package", "resources": [ @@ -81,6 +81,75 @@ ] } }, + { + "bytes": 2630, + "dialect": { + "delimiter": ",", + "doubleQuote": true, + "lineTerminator": "\r\n", + "quoteChar": "\"", + "skipInitialSpace": false + }, + "encoding": "utf-8", + "format": "csv", + "hash": "8928741fc196e6f76113ed59b080b753", + "name": "key-countries-pivoted", + "path": "data/key-countries-pivoted.csv", + "profile": "data-resource", + "schema": { + "fields": [ + { + "format": "%Y-%m-%d", + "name": "Date", + "type": "date" + }, + { + "name": "China", + "title": "Cumulative total confirmed cases to date.", + "type": "integer" + }, + { + "name": "US", + "title": "Cumulative total confirmed cases to date.", + "type": "integer" + }, + { + "name": "United_Kingdom", + "title": "Cumulative total confirmed cases to date.", + 
def _pivot_confirmed_by_date(rows, key_countries):
    """Pivot long-format country rows into one wide row per date.

    Each input row is read through its ``'Date'``, ``'Country'`` and
    ``'Confirmed'`` keys. Spaces in the country name are replaced with
    underscores before it is matched against *key_countries*; rows for any
    other country are ignored.

    Rows are grouped by their ``'Date'`` value rather than by arrival order,
    so a country that is absent for some date can no longer cause counts from
    two different dates to be merged into a single output row. A country with
    no row for a date is left as ``None`` in that date's output row.

    :param rows: iterable of dict-like rows.
    :param key_countries: ordered collection of (underscore-normalised)
        country names that become the output columns.
    :return: generator of dicts keyed ``'Date'`` followed by every name in
        *key_countries*, emitted in first-seen date order.
    """
    rows_by_date = {}
    for row in rows:
        country = row['Country'].replace(' ', '_')
        if country not in key_countries:
            continue
        date = row['Date']
        pivoted = rows_by_date.get(date)
        if pivoted is None:
            # 'Date' first, then one empty slot per key country.
            pivoted = {'Date': date, **dict.fromkeys(key_countries)}
            rows_by_date[date] = pivoted
        pivoted[country] = row['Confirmed']
    # The whole resource is consumed before anything is emitted; only one
    # small dict per distinct date is held in memory.
    yield from rows_by_date.values()


def pivot_key_countries(package):
    """Package-level step that pivots the second resource by key country.

    Appends one integer field per key country to the schema of the resource
    at index 1 of ``package.pkg.descriptor['resources']``, yields
    ``package.pkg``, then yields every resource obtained by iterating
    *package*: the resource at index 1 is replaced by its pivoted form (one
    row per date, one column per key country) and all others pass through
    unchanged, however many there are.

    :param package: object exposing ``.pkg.descriptor`` and iterating over
        its resources (presumably a dataflows package wrapper -- confirm
        against the pipeline that installs this step).
    :return: generator yielding ``package.pkg`` first, then one row iterable
        per resource.
    :raises IndexError: if the descriptor lists fewer than two resources.
    """
    key_countries = (
        'China', 'US', 'United_Kingdom', 'Italy',
        'France', 'Germany', 'Spain', 'Iran',
    )
    # Position of the resource to pivot, in both the descriptor and the
    # resource stream.
    pivot_index = 1

    fields = package.pkg.descriptor['resources'][pivot_index]['schema']['fields']
    fields.extend(
        dict(
            name=country,
            type='integer',
            title='Cumulative total confirmed cases to date.',
        )
        for country in key_countries
    )
    yield package.pkg

    # Enumerating (instead of four bare next() calls) keeps the step working
    # when the package has more or fewer resources than expected.
    for index, rows in enumerate(package):
        if index == pivot_index:
            yield _pivot_confirmed_by_date(rows, key_countries)
        else:
            yield rows
resources='key-countries-pivoted'), # Prepare data package (name, title) and add views update_package( name='covid-19',