Browse Source

[Data]: Refactor to Combine confirmed, death and recovery cases + update data

master
zelima 4 years ago
parent
commit
c24f4d29e6
6 changed files with 9025 additions and 26895 deletions
  1. +22
    -8
      datapackage.json
  2. +47
    -19
      process.py
  3. +8956
    -0
      time-series-19-covid-combined.csv
  4. +0
    -8956
      time_series_19-covid-Confirmed.csv
  5. +0
    -8956
      time_series_19-covid-Deaths.csv
  6. +0
    -8956
      time_series_19-covid-Recovered.csv

+ 22
- 8
datapackage.json View File

@@ -1,11 +1,11 @@
{ {
"bytes": 419065,
"bytes": 463524,
"count_of_rows": 8955, "count_of_rows": 8955,
"hash": "e96470246df7974be1ac04dc880b3799",
"hash": "2e74b71787289aac912b647688893625",
"profile": "data-package", "profile": "data-package",
"resources": [ "resources": [
{ {
"bytes": 419065,
"bytes": 463524,
"dialect": { "dialect": {
"caseSensitiveHeader": false, "caseSensitiveHeader": false,
"delimiter": ",", "delimiter": ",",
@@ -17,9 +17,9 @@
}, },
"encoding": "utf-8", "encoding": "utf-8",
"format": "csv", "format": "csv",
"hash": "8697a80eb83feaf11c7041b2724eaa91",
"name": "time_series_19-covid-Recovered",
"path": "time_series_19-covid-Recovered.csv",
"hash": "88df9ccb75f4858ce93bc6607a9fefec",
"name": "time-series-19-covid-combined",
"path": "time-series-19-covid-combined.csv",
"profile": "tabular-data-resource", "profile": "tabular-data-resource",
"schema": { "schema": {
"fields": [ "fields": [
@@ -49,14 +49,28 @@
}, },
{ {
"format": "%Y-%m-%d", "format": "%Y-%m-%d",
"name": "date",
"name": "Date",
"type": "date" "type": "date"
}, },
{ {
"decimalChar": ".", "decimalChar": ".",
"format": "default", "format": "default",
"groupChar": "", "groupChar": "",
"name": "case",
"name": "Confirmed",
"type": "number"
},
{
"decimalChar": ".",
"format": "default",
"groupChar": "",
"name": "Recovered",
"type": "number"
},
{
"decimalChar": ".",
"format": "default",
"groupChar": "",
"name": "Deaths",
"type": "number" "type": "number"
} }
], ],


+ 47
- 19
process.py View File

@@ -1,6 +1,4 @@
import csv
from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path
import datapackage
from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_resource, join, add_computed_field, delete_fields
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/' BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
CONFIRMED = 'time_series_19-covid-Confirmed.csv' CONFIRMED = 'time_series_19-covid-Confirmed.csv'
@@ -8,26 +8,56 @@ DEATH = 'time_series_19-covid-Deaths.csv'
RECOVERED = 'time_series_19-covid-Recovered.csv' RECOVERED = 'time_series_19-covid-Recovered.csv'
def to_normal_date(row): def to_normal_date(row):
old_date = row['date']
month, day, year = row['date'].split('-')
old_date = row['Date']
month, day, year = row['Date'].split('-')
day = f'0{day}' if len(day) == 1 else day day = f'0{day}' if len(day) == 1 else day
month = f'0{month}' if len(month) == 1 else month month = f'0{month}' if len(month) == 1 else month
row['date'] = '-'.join([day, month, year])
row['Date'] = '-'.join([day, month, year])
unpivoting_fields = [ unpivoting_fields = [
{ 'name': '([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'date': r'\1'} }
{ 'name': '([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'Date': r'\1'} }
] ]
extra_keys = [{'name': 'date', 'type': 'string'} ]
extra_value = {'name': 'case', 'type': 'string'}
extra_keys = [{'name': 'Date', 'type': 'string'} ]
extra_value = {'name': 'Case', 'type': 'number'}
for case in [CONFIRMED, DEATH, RECOVERED]:
Flow(
load(f'{BASE_URL}{case}'),
unpivot(unpivoting_fields, extra_keys, extra_value),
find_replace([{'name': 'date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
to_normal_date,
set_type('date', type='date', format='%d-%m-%y'),
set_type('case', type='number'),
dump_to_path()
).results()[0]
Flow(
load(f'{BASE_URL}{CONFIRMED}'),
load(f'{BASE_URL}{RECOVERED}'),
load(f'{BASE_URL}{DEATH}'),
unpivot(unpivoting_fields, extra_keys, extra_value),
find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
to_normal_date,
set_type('Date', type='date', format='%d-%m-%y', resources=None),
set_type('Case', type='number', resources=None),
join(
source_name='time_series_19-covid-Confirmed',
source_key=['Province/State', 'Date'],
source_delete=True,
target_name='time_series_19-covid-Deaths',
target_key=['Province/State', 'Date'],
fields=dict(Confirmed={
'name': 'Case',
'aggregate': 'first'
})
),
join(
source_name='time_series_19-covid-Recovered',
source_key=['Province/State', 'Date'],
source_delete=True,
target_name='time_series_19-covid-Deaths',
target_key=['Province/State', 'Date'],
fields=dict(Recovered={
'name': 'Case',
'aggregate': 'first'
})
),
add_computed_field(
target={'name': 'Deaths', 'type': 'number'},
operation='format',
with_='{Case}'
),
delete_fields(['Case']),
update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'),
dump_to_path()
).results()[0]

+ 8956
- 0
time-series-19-covid-combined.csv
File diff suppressed because it is too large
View File


+ 0
- 8956
time_series_19-covid-Confirmed.csv
File diff suppressed because it is too large
View File


+ 0
- 8956
time_series_19-covid-Deaths.csv
File diff suppressed because it is too large
View File


+ 0
- 8956
time_series_19-covid-Recovered.csv
File diff suppressed because it is too large
View File


Loading…
Cancel
Save