Browse Source

Merge pull request #18 from datasets/feature/datapackage-improvements

Data Package improvements
master
Anuar Ustayev 4 years ago
committed by GitHub
parent
commit
72fd6a2440
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 147 additions and 7 deletions
  1. +3
    -0
      .gitignore
  2. +0
    -0
      data/time-series-19-covid-combined.csv
  3. +61
    -0
      data/worldwide-aggregated.csv
  4. +50
    -5
      datapackage.json
  5. +33
    -2
      process.py

+ 3
- 0
.gitignore View File

@@ -144,3 +144,6 @@ static
# put your virtual env names here
env36/
# dataflows checkpoints
.checkpoints/

time-series-19-covid-combined.csv → data/time-series-19-covid-combined.csv View File


+ 61
- 0
data/worldwide-aggregated.csv View File

@@ -0,0 +1,61 @@
Confirmed,Date,Deaths,Recovered
555,2020-01-22,17,28
653,2020-01-23,18,30
941,2020-01-24,26,36
1434,2020-01-25,42,39
2118,2020-01-26,56,52
2927,2020-01-27,82,61
5578,2020-01-28,131,107
6166,2020-01-29,133,126
8234,2020-01-30,171,143
9927,2020-01-31,213,222
12038,2020-02-01,259,284
16787,2020-02-02,362,472
19881,2020-02-03,426,623
23892,2020-02-04,492,852
27635,2020-02-05,564,1124
30817,2020-02-06,634,1487
34391,2020-02-07,719,2011
37120,2020-02-08,806,2616
40150,2020-02-09,906,3244
42762,2020-02-10,1013,3946
44802,2020-02-11,1113,4683
45221,2020-02-12,1118,5150
60368,2020-02-13,1371,6295
66885,2020-02-14,1523,8058
69030,2020-02-15,1666,9395
71224,2020-02-16,1770,10865
73258,2020-02-17,1868,12583
75136,2020-02-18,2007,14352
75639,2020-02-19,2122,16121
76197,2020-02-20,2247,18177
76823,2020-02-21,2251,18890
78579,2020-02-22,2458,22886
78965,2020-02-23,2469,23394
79568,2020-02-24,2629,25227
80413,2020-02-25,2708,27905
81395,2020-02-26,2770,30384
82754,2020-02-27,2814,33277
84120,2020-02-28,2872,36711
86011,2020-02-29,2941,39782
88369,2020-03-01,2996,42716
90306,2020-03-02,3085,45602
92840,2020-03-03,3160,48228
95120,2020-03-04,3254,51170
97882,2020-03-05,3348,53796
101784,2020-03-06,3460,55865
105821,2020-03-07,3558,58358
109795,2020-03-08,3802,60694
113561,2020-03-09,3988,62494
118592,2020-03-10,4262,64404
125865,2020-03-11,4615,67003
128343,2020-03-12,4720,68324
145193,2020-03-13,5404,70251
156094,2020-03-14,5819,72624
167446,2020-03-15,6440,76034
181527,2020-03-16,7126,78088
197142,2020-03-17,7905,80840
214910,2020-03-18,8733,83207
242708,2020-03-19,9867,84854
272166,2020-03-20,11299,87256
304524,2020-03-21,12973,91499

+ 50
- 5
datapackage.json View File

@@ -1,7 +1,8 @@
{
-"bytes": 1465368,
-"count_of_rows": 28920,
-"hash": "0645dcfca5d10e757252920dbcdbe67d",
+"bytes": 1467076,
+"count_of_rows": 28980,
+"hash": "6e25a726932e6dabaaa8b8e5ed578dd7",
"name": "covid-19",
"profile": "data-package",
"resources": [
{
@@ -19,7 +20,7 @@
"format": "csv",
"hash": "7b65c350a6e621e770bdce075df8b657",
"name": "time-series-19-covid-combined",
-"path": "time-series-19-covid-combined.csv",
+"path": "data/time-series-19-covid-combined.csv",
"profile": "tabular-data-resource",
"schema": {
"fields": [
@@ -78,6 +79,50 @@
""
]
}
},
{
"bytes": 1708,
"dialect": {
"delimiter": ",",
"doubleQuote": true,
"lineTerminator": "\r\n",
"quoteChar": "\"",
"skipInitialSpace": false
},
"encoding": "utf-8",
"format": "csv",
"hash": "f261b5b527542ceace4f2c7941d69e40",
"name": "worldwide-aggregated",
"path": "data/worldwide-aggregated.csv",
"profile": "data-resource",
"schema": {
"fields": [
{
"decimalChar": ".",
"groupChar": "",
"name": "Confirmed",
"type": "number"
},
{
"format": "%Y-%m-%d",
"name": "Date",
"type": "date"
},
{
"decimalChar": ".",
"groupChar": "",
"name": "Deaths",
"type": "number"
},
{
"decimalChar": ".",
"groupChar": "",
"name": "Recovered",
"type": "number"
}
]
}
}
-]
+],
"title": "Novel Coronavirus 2019"
}

+ 33
- 2
process.py View File

@@ -1,4 +1,4 @@
-from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_resource, join, add_computed_field, delete_fields
+from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
CONFIRMED = 'time_series_19-covid-Confirmed.csv'
@@ -23,6 +23,7 @@ Flow(
load(f'{BASE_URL}{CONFIRMED}'),
load(f'{BASE_URL}{RECOVERED}'),
load(f'{BASE_URL}{DEATH}'),
checkpoint('load_data'),
unpivot(unpivoting_fields, extra_keys, extra_value),
find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
to_normal_date,
@@ -56,6 +57,36 @@ Flow(
with_='{Case}'
),
delete_fields(['Case']),
-update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='time-series-19-covid-combined.csv'),
+update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='data/time-series-19-covid-combined.csv'),
update_package(name='covid-19', title='Novel Coronavirus 2019'),
dump_to_path(),
checkpoint('processed_data'),
# Duplicate the stream to create aggregated data
duplicate(
source='time-series-19-covid-combined',
target_name='worldwide-aggregated',
target_path='worldwide-aggregated.csv'
),
join_with_self(
resource_name='worldwide-aggregated',
join_key=['Date'],
fields=dict(
Date={
'name': 'Date'
},
Confirmed={
'name': 'Confirmed',
'aggregate': 'sum'
},
Recovered={
'name': 'Recovered',
'aggregate': 'sum'
},
Deaths={
'name': 'Deaths',
'aggregate': 'sum'
}
)
),
dump_to_path()
).results()[0]

Loading…
Cancel
Save