From d3dde96de59037d89745196eaafa1d7c1bf06ab7 Mon Sep 17 00:00:00 2001 From: anuveyatsu Date: Mon, 23 Mar 2020 17:47:43 +0600 Subject: [PATCH] [data][s]: add another resource with data per country. We have a resource with data by province but it is not aggregated by country, e.g., if you need data for China you'd need to sum up all its provinces. --- process.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/process.py b/process.py index e8d3899..2a05157 100644 --- a/process.py +++ b/process.py @@ -1,4 +1,4 @@ -from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate +from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate, filter_rows BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/' CONFIRMED = 'time_series_19-covid-Confirmed.csv' @@ -19,6 +19,10 @@ unpivoting_fields = [ extra_keys = [{'name': 'Date', 'type': 'string'} ] extra_value = {'name': 'Case', 'type': 'number'} +def is_key_country(row): + key_countries = ['Chine', 'US', 'United Kingdom', 'Italy', 'France', 'Germany'] + return row['Country'] in key_countries + Flow( load(f'{BASE_URL}{CONFIRMED}'), load(f'{BASE_URL}{RECOVERED}'), @@ -115,7 +119,7 @@ Flow( duplicate( source='time-series-19-covid-combined', target_name='worldwide-aggregated', - target_path='worldwide-aggregated.csv' + target_path='data/worldwide-aggregated.csv' ), join_with_self( resource_name='worldwide-aggregated', @@ -166,6 +170,72 @@ Flow( "type": "integer" } ]), + checkpoint('processed_worldwide_data'), + # Create another resource with countries aggregated + duplicate( + source='time-series-19-covid-combined', + target_name='countries-aggregated', + target_path='data/countries-aggregated.csv' + ), + join_with_self( + resource_name='countries-aggregated', + join_key=['Date', 'Country/Region'], + fields=dict( + Date={ + 'name': 'Date' + }, + Country={ + 'name': 'Country/Region' + }, + Confirmed={ + 'name': 'Confirmed', + 'aggregate': 'sum' + }, + Recovered={ + 'name': 'Recovered', + 'aggregate': 'sum' + }, + Deaths={ + 'name': 'Deaths', + 'aggregate': 'sum' + } + ) + ), + update_schema('countries-aggregated', fields=[ + { + "format": "%Y-%m-%d", + "name": "Date", + "type": "date" + }, + { + "format": "default", + "name": "Country", + "type": "string" + }, + { + "format": "default", + "groupChar": "", + "name": "Confirmed", + "title": "Cumulative total confirmed cases to date", + "type": "integer" + }, + { + "format": "default", + "groupChar": "", + "name": "Recovered", + "title": "Cumulative total recovered cases to date", + "type": "integer" + }, + { + "format": "default", + "groupChar": "", + "name": "Deaths", + "title": "Cumulative total deaths to date", + "type": "integer" + } + ]), + checkpoint('processed_country_data'), + # Prepare data package (name, title) and add views update_package( name='covid-19', title='Novel Coronavirus 2019',