# COVID-19 / covid19-data
# Mirror of https://github.com/gmarxcc/covid-19

from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate, filter_rows
# Directory of the CSSE time-series CSV files; the file names below are
# appended to it to form the URLs that are loaded.
BASE_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)

# One CSV file name per series.
CONFIRMED = 'time_series_covid19_confirmed_global.csv'
DEATH = 'time_series_covid19_deaths_global.csv'
RECOVERED = 'time_series_19-covid-Recovered.csv'
def to_normal_date(row):
    """Rewrite ``row['Date']`` from ``month-day-year`` to ``day-month-year``.

    The value is split on ``-`` into exactly three parts. Single-character
    day and month parts are left-padded with ``0``; the year part is copied
    through unchanged. The row is modified in place and nothing is returned.

    Args:
        row: mapping with a ``'Date'`` string of the form ``M-D-Y``.

    Raises:
        KeyError: if ``row`` has no ``'Date'`` entry.
        ValueError: if the value does not split into exactly three parts.
    """
    month, day, year = row['Date'].split('-')
    # Pad only one-character parts; longer parts are kept as they are.
    day = f'0{day}' if len(day) == 1 else day
    month = f'0{month}' if len(month) == 1 else month
    row['Date'] = '-'.join([day, month, year])
# Column-name pattern handed to `unpivot`: it matches headers made of three
# digit groups separated by '/', and the captured header text is referenced
# as the 'Date' key via the \1 back-reference.
# The pattern is a raw string so that `\/` is not parsed as a (invalid)
# string escape sequence; the resulting text is unchanged.
unpivoting_fields = [
    {'name': r'([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'Date': r'\1'}},
]
# Descriptors passed to `unpivot` alongside `unpivoting_fields`:
# `extra_keys` describes the 'Date' key field (a string) and `extra_value`
# describes the 'Case' value field (a number).
extra_keys = [dict(name='Date', type='string')]
extra_value = dict(name='Case', type='number')

def pivot_key_countries(package):
    """Pivot the second resource into one row per date with a column per key country.

    The step first appends one integer field per key country to the schema of
    the resource at index 1, then yields the package descriptor followed by
    exactly four resources: the first, third and fourth are passed through
    untouched, the second is replaced by its pivoted rows.

    Args:
        package: object exposing ``pkg.descriptor`` and iterating over its
            resources (row iterables).

    Yields:
        ``package.pkg`` first, then the four resources in their original
        order.
    """
    key_countries = ['China', 'US', 'United_Kingdom', 'Italy', 'France', 'Germany', 'Spain', 'Iran']

    # Declare the new per-country columns on the resource that gets pivoted.
    pivoted_fields = package.pkg.descriptor['resources'][1]['schema']['fields']
    for name in key_countries:
        pivoted_fields.append({
            'name': name,
            'type': 'integer',
            'title': 'Cumulative total confirmed cases to date.',
        })
    yield package.pkg

    # Template of an unfilled output row: 'Date' plus every key country.
    columns = ['Date', *key_countries]

    def process_rows(rows):
        """Collect the key countries' 'Confirmed' values into wide rows."""
        pending = dict.fromkeys(columns)
        for record in rows:
            # Country names use spaces in the data, underscores in columns.
            column = record['Country'].replace(' ', '_')
            if column in key_countries:
                pending['Date'] = record['Date']
                pending[column] = record['Confirmed']
            # A row is emitted only once no column is left unset.
            if None not in pending.values():
                yield pending
                pending = dict.fromkeys(columns)

    resources = iter(package)
    yield next(resources)                # data by province: unchanged
    yield process_rows(next(resources))  # key countries: pivoted
    yield next(resources)                # data by country: unchanged
    yield next(resources)                # worldwide: unchanged
# Titles of the three cumulative-count columns, in the order they appear in
# every output schema.
_TOTAL_TITLES = {
    'Confirmed': 'Cumulative total confirmed cases to date',
    'Recovered': 'Cumulative total recovered cases to date',
    'Deaths': 'Cumulative total deaths to date',
}


def _date_field():
    """Return a fresh schema descriptor for the 'Date' column."""
    return {"format": "%Y-%m-%d", "name": "Date", "type": "date"}


def _string_field(name):
    """Return a fresh schema descriptor for a string column called `name`."""
    return {"format": "default", "name": name, "type": "string"}


def _coordinate_field(name):
    """Return a fresh schema descriptor for a numeric column called `name`."""
    return {
        "decimalChar": ".",
        "format": "default",
        "groupChar": "",
        "name": name,
        "type": "number",
    }


def _total_fields():
    """Return fresh integer descriptors for Confirmed, Recovered and Deaths."""
    return [
        {
            "format": "default",
            "groupChar": "",
            "name": name,
            "title": title,
            "type": "integer",
        }
        for name, title in _TOTAL_TITLES.items()
    ]


def _summed_totals():
    """Return 'sum' aggregation specs for Confirmed, Recovered and Deaths."""
    return {name: {'name': name, 'aggregate': 'sum'} for name in _TOTAL_TITLES}


def _join_cases_into_deaths(source_name, field_name):
    """Build a join that copies `source_name`'s 'Case' into the deaths resource.

    Rows are matched on province, country and date; the source resource is
    deleted by the join and its value lands in a field called `field_name`.
    """
    return join(
        source_name=source_name,
        source_key=['Province/State', 'Country/Region', 'Date'],
        source_delete=True,
        target_name='time_series_covid19_deaths_global',
        target_key=['Province/State', 'Country/Region', 'Date'],
        fields={field_name: {'name': 'Case', 'aggregate': 'first'}},
    )


Flow(
    # Load the three time series and reshape the date columns into rows.
    load(f'{BASE_URL}{CONFIRMED}'),
    load(f'{BASE_URL}{RECOVERED}'),
    load(f'{BASE_URL}{DEATH}'),
    checkpoint('load_data'),
    unpivot(unpivoting_fields, extra_keys, extra_value),
    find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
    to_normal_date,
    set_type('Date', type='date', format='%d-%m-%y', resources=None),
    set_type('Case', type='number', resources=None),

    # Merge confirmed and recovered counts into the deaths resource, then
    # rename what is left of 'Case' to 'Deaths'.
    _join_cases_into_deaths('time_series_covid19_confirmed_global', 'Confirmed'),
    _join_cases_into_deaths('time_series_19-covid-Recovered', 'Recovered'),
    add_computed_field(
        target={'name': 'Deaths', 'type': 'number'},
        operation='format',
        with_='{Case}',
    ),
    delete_fields(['Case']),
    update_resource(
        'time_series_covid19_deaths_global',
        name='time-series-19-covid-combined',
        path='data/time-series-19-covid-combined.csv',
    ),
    update_schema(
        'time-series-19-covid-combined',
        missingValues=['None', ''],
        fields=[
            _date_field(),
            _string_field("Country/Region"),
            _string_field("Province/State"),
            _coordinate_field("Lat"),
            _coordinate_field("Long"),
            *_total_fields(),
        ],
    ),
    checkpoint('processed_data'),

    # Duplicate the stream to create aggregated data
    duplicate(
        source='time-series-19-covid-combined',
        target_name='worldwide-aggregated',
        target_path='data/worldwide-aggregated.csv',
    ),
    join_with_self(
        resource_name='worldwide-aggregated',
        join_key=['Date'],
        fields={'Date': {'name': 'Date'}, **_summed_totals()},
    ),
    update_schema(
        'worldwide-aggregated',
        missingValues=['None', ''],
        fields=[_date_field(), *_total_fields()],
    ),
    checkpoint('processed_worldwide_data'),

    # Create another resource with key countries pivoted
    duplicate(
        source='time-series-19-covid-combined',
        target_name='key-countries-pivoted',
        target_path='data/key-countries-pivoted.csv',
    ),
    join_with_self(
        resource_name='key-countries-pivoted',
        join_key=['Date', 'Country/Region'],
        fields={
            'Date': {'name': 'Date'},
            'Country': {'name': 'Country/Region'},
            **_summed_totals(),
        },
    ),
    update_schema(
        'key-countries-pivoted',
        missingValues=['None', ''],
        fields=[_date_field(), _string_field("Country"), *_total_fields()],
    ),
    checkpoint('processed_country_data'),

    # All countries aggregated
    duplicate(
        source='key-countries-pivoted',
        target_name='countries-aggregated',
        target_path='data/countries-aggregated.csv',
    ),
    pivot_key_countries,
    delete_fields(
        ['Country', 'Confirmed', 'Recovered', 'Deaths'],
        resources='key-countries-pivoted',
    ),

    # Prepare data package (name, title) and add views
    update_package(
        name='covid-19',
        title='Novel Coronavirus 2019',
        views=[
            {
                "title": "Total world to date",
                "resources": ["worldwide-aggregated"],
                "specType": "simple",
                "spec": {
                    "group": "Date",
                    "series": ["Confirmed", "Recovered", "Deaths"],
                    "type": "line",
                },
            },
            {
                "title": "Number of confirmed cases in key countries",
                "resources": ["key-countries-pivoted"],
                "specType": "simple",
                "spec": {
                    "group": "Date",
                    "series": ["China", "US", "United_Kingdom", "Italy", "France", "Germany", "Spain", "Iran"],
                    "type": "line",
                },
            },
        ],
    ),
    dump_to_path(),
).results()[0]