Browse Source

[data][xl]: added a new resource with key countries and pivoted it to make it easy to visualize.

master
anuveyatsu 4 years ago
parent
commit
ad6dd56ddd
3 changed files with 182 additions and 11 deletions
  1. +63
    -0
      data/key-countries-pivoted.csv
  2. +72
    -3
      datapackage.json
  3. +47
    -8
      process.py

+ 63
- 0
data/key-countries-pivoted.csv View File

@ -0,0 +1,63 @@
Date,China,US,United_Kingdom,Italy,France,Germany,Spain,Iran
2020-01-22,548,1,0,0,0,0,0,0
2020-01-23,643,1,0,0,0,0,0,0
2020-01-24,920,2,0,0,2,0,0,0
2020-01-25,1406,2,0,0,3,0,0,0
2020-01-26,2075,5,0,0,3,0,0,0
2020-01-27,2877,5,0,0,3,1,0,0
2020-01-28,5509,5,0,0,4,4,0,0
2020-01-29,6087,5,0,0,5,4,0,0
2020-01-30,8141,5,0,0,5,4,0,0
2020-01-31,9802,7,2,2,5,5,0,0
2020-02-01,11891,8,2,2,6,8,1,0
2020-02-02,16630,8,2,2,6,10,1,0
2020-02-03,19716,11,2,2,6,12,1,0
2020-02-04,23707,11,2,2,6,12,1,0
2020-02-05,27440,11,2,2,6,12,1,0
2020-02-06,30587,11,2,2,6,12,1,0
2020-02-07,34110,11,3,3,6,13,1,0
2020-02-08,36814,11,3,3,11,13,1,0
2020-02-09,39829,11,3,3,11,14,2,0
2020-02-10,42354,11,8,3,11,14,2,0
2020-02-11,44386,12,8,3,11,16,2,0
2020-02-12,44759,12,9,3,11,16,2,0
2020-02-13,59895,13,9,3,11,16,2,0
2020-02-14,66358,13,9,3,11,16,2,0
2020-02-15,68413,13,9,3,12,16,2,0
2020-02-16,70513,13,9,3,12,16,2,0
2020-02-17,72434,13,9,3,12,16,2,0
2020-02-18,74211,13,9,3,12,16,2,0
2020-02-19,74619,13,9,3,12,16,2,2
2020-02-20,75077,13,9,3,12,16,2,5
2020-02-21,75550,15,9,20,12,16,2,18
2020-02-22,77001,15,9,62,12,16,2,28
2020-02-23,77022,15,9,155,12,16,2,43
2020-02-24,77241,51,13,229,12,16,2,61
2020-02-25,77754,51,13,322,14,17,6,95
2020-02-26,78166,57,13,453,18,27,13,139
2020-02-27,78600,58,15,655,38,46,15,245
2020-02-28,78928,60,20,888,57,48,32,388
2020-02-29,79356,68,23,1128,100,79,45,593
2020-03-01,79932,74,36,1694,130,130,84,978
2020-03-02,80136,98,40,2036,191,159,120,1501
2020-03-03,80261,118,51,2502,204,196,165,2336
2020-03-04,80386,149,86,3089,288,262,222,2922
2020-03-05,80537,217,116,3858,380,482,259,3513
2020-03-06,80690,262,164,4636,656,670,400,4747
2020-03-07,80770,402,207,5883,959,799,500,5823
2020-03-08,80823,518,274,7375,1136,1040,673,6566
2020-03-09,80860,583,322,9172,1219,1176,1073,7161
2020-03-10,80887,959,384,10149,1794,1457,1695,8042
2020-03-11,80921,1281,459,12462,2293,1908,2277,9000
2020-03-12,80932,1663,459,12462,2293,2078,2277,10075
2020-03-13,80945,2179,802,17660,3681,3675,5232,11364
2020-03-14,80977,2727,1144,21157,4496,4585,6391,12729
2020-03-15,81003,3499,1145,24747,4532,5795,7798,13938
2020-03-16,81033,4632,1551,27980,6683,7272,9942,14991
2020-03-17,81058,6421,1960,31506,7715,9257,11748,16169
2020-03-18,81102,7783,2642,35713,9124,12327,13910,17361
2020-03-19,81156,13677,2716,41035,10970,15320,17963,18407
2020-03-20,81250,19100,4014,47021,12758,19848,20410,19644
2020-03-21,81305,25489,5067,53578,14463,22213,25374,20610
2020-03-22,81435,33276,5745,59138,16243,24873,28768,21638
2020-03-23,81498,43847,6726,63927,20123,29056,35136,23049

+ 72
- 3
datapackage.json View File

@ -1,7 +1,7 @@
{
"bytes": 976643,
"count_of_rows": 25234,
"hash": "9bc713d601a6c93496187964b2922be5",
"bytes": 979273,
"count_of_rows": 25296,
"hash": "6c8d26b210a9141fbcd24652a104fae1",
"name": "covid-19",
"profile": "data-package",
"resources": [
@ -81,6 +81,75 @@
]
}
},
{
"bytes": 2630,
"dialect": {
"delimiter": ",",
"doubleQuote": true,
"lineTerminator": "\r\n",
"quoteChar": "\"",
"skipInitialSpace": false
},
"encoding": "utf-8",
"format": "csv",
"hash": "8928741fc196e6f76113ed59b080b753",
"name": "key-countries-pivoted",
"path": "data/key-countries-pivoted.csv",
"profile": "data-resource",
"schema": {
"fields": [
{
"format": "%Y-%m-%d",
"name": "Date",
"type": "date"
},
{
"name": "China",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "US",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "United_Kingdom",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "Italy",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "France",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "Germany",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "Spain",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
},
{
"name": "Iran",
"title": "Cumulative total confirmed cases to date.",
"type": "integer"
}
],
"missingValues": [
"None",
""
]
}
},
{
"bytes": 289357,
"dialect": {


+ 47
- 8
process.py View File

@ -19,9 +19,40 @@ unpivoting_fields = [
# Field descriptor(s) for the extra key column: a single 'Date' field typed as string.
# NOTE(review): presumably consumed by an unpivot step together with
# `unpivoting_fields` — the call site is not visible here; confirm.
extra_keys = [{'name': 'Date', 'type': 'string'} ]
# Field descriptor for the extra value column: 'Case', typed as number.
extra_value = {'name': 'Case', 'type': 'number'}
def is_key_country(row):
    """Return True when ``row['Country']`` is one of the key countries.

    Args:
        row: Mapping with a ``'Country'`` entry holding the country name
            exactly as spelled in the data (e.g. ``'United Kingdom'``).

    Returns:
        bool: whether the row belongs to a key country.
    """
    # 'China' was previously misspelled as 'Chine', so China rows never matched.
    key_countries = ['China', 'US', 'United Kingdom', 'Italy', 'France', 'Germany']
    return row['Country'] in key_countries
def pivot_key_countries(package):
    """Pivot the second resource so each key country becomes its own column.

    The function first appends one integer field per key country to the
    schema of the resource at index 1 of the package descriptor and yields
    the package descriptor object. It then yields the resources in their
    original order, replacing only the second one with a generator of
    pivoted rows; the first, third and fourth resources pass through
    unchanged.

    Args:
        package: Object exposing ``package.pkg.descriptor`` (a dict with a
            ``'resources'`` list) and iterating over at least four row
            iterables. NOTE(review): presumably the package wrapper that
            ``Flow`` hands to a package-level step — confirm.

    Yields:
        ``package.pkg`` first, then four resources (row iterables).
    """
    key_countries = ['China', 'US', 'United_Kingdom', 'Italy', 'France', 'Germany', 'Spain', 'Iran']

    # The resource position is hard-coded: index 1 must be the resource that
    # is being pivoted. NOTE(review): presumably 'key-countries-pivoted'.
    schema_fields = package.pkg.descriptor['resources'][1]['schema']['fields']
    for country in key_countries:
        schema_fields.append(dict(
            name=country,
            type='integer',
            title='Cumulative total confirmed cases to date.'
        ))
    yield package.pkg

    resources = iter(package)

    # First resource: passed through untouched.
    yield next(resources)

    data_by_key_countries = next(resources)

    def blank_row():
        # Single source of truth for the pivoted row layout: 'Date' followed
        # by one column per key country, every value initialised to None.
        return dict.fromkeys(['Date'] + key_countries)

    def process_rows(rows):
        # Accumulate one value per key country and emit the row only once
        # every column (including 'Date') is filled, then start a new one.
        # NOTE(review): an incomplete row is never discarded, so if a key
        # country is missing for some date its slot is filled by a later
        # date's value — this relies on the input having every key country
        # for every date, grouped by date.
        new_row = blank_row()
        for row in rows:
            # Column names use underscores ('United Kingdom' -> 'United_Kingdom').
            country = row['Country'].replace(' ', '_')
            if country not in key_countries:
                continue
            new_row['Date'] = row['Date']
            new_row[country] = row['Confirmed']
            if None not in new_row.values():
                yield new_row
                new_row = blank_row()

    yield process_rows(data_by_key_countries)

    # Remaining two resources: passed through untouched.
    yield next(resources)
    yield next(resources)
Flow(
load(f'{BASE_URL}{CONFIRMED}'),
@ -171,14 +202,14 @@ Flow(
}
]),
checkpoint('processed_worldwide_data'),
# Create another resource with countries aggregated
# Create another resource with key countries pivoted
duplicate(
source='time-series-19-covid-combined',
target_name='countries-aggregated',
target_path='data/countries-aggregated.csv'
target_name='key-countries-pivoted',
target_path='data/key-countries-pivoted.csv'
),
join_with_self(
resource_name='countries-aggregated',
resource_name='key-countries-pivoted',
join_key=['Date', 'Country/Region'],
fields=dict(
Date={
@ -201,7 +232,7 @@ Flow(
}
)
),
update_schema('countries-aggregated', missingValues=['None', ''], fields=[
update_schema('key-countries-pivoted', missingValues=['None', ''], fields=[
{
"format": "%Y-%m-%d",
"name": "Date",
@ -235,6 +266,14 @@ Flow(
}
]),
checkpoint('processed_country_data'),
# All countries aggregated
duplicate(
source='key-countries-pivoted',
target_name='countries-aggregated',
target_path='data/countries-aggregated.csv'
),
pivot_key_countries,
delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'], resources='key-countries-pivoted'),
# Prepare data package (name, title) and add views
update_package(
name='covid-19',


Loading…
Cancel
Save