You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

305 lines
9.6 KiB

  1. from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate, filter_rows
  2. BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
  3. CONFIRMED = 'time_series_covid19_confirmed_global.csv'
  4. DEATH = 'time_series_covid19_deaths_global.csv'
  5. RECOVERED = 'time_series_19-covid-Recovered.csv'
  6. def to_normal_date(row):
  7. old_date = row['Date']
  8. month, day, year = row['Date'].split('-')
  9. day = f'0{day}' if len(day) == 1 else day
  10. month = f'0{month}' if len(month) == 1 else month
  11. row['Date'] = '-'.join([day, month, year])
  12. unpivoting_fields = [
  13. { 'name': '([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'Date': r'\1'} }
  14. ]
  15. extra_keys = [{'name': 'Date', 'type': 'string'} ]
  16. extra_value = {'name': 'Case', 'type': 'number'}
  17. def pivot_key_countries(package):
  18. key_countries = ['China', 'US', 'United_Kingdom', 'Italy', 'France', 'Germany', 'Spain', 'Iran']
  19. for country in key_countries:
  20. package.pkg.descriptor['resources'][1]['schema']['fields'].append(dict(
  21. name=country,
  22. type='integer',
  23. title='Cumulative total confirmed cases to date.'
  24. ))
  25. yield package.pkg
  26. resources = iter(package)
  27. data_by_province = next(resources)
  28. yield data_by_province
  29. data_by_key_countries = next(resources)
  30. def process_rows(rows):
  31. new_row = dict(Date=None, China=None, US=None, United_Kingdom=None, Italy=None, France=None, Germany=None, Spain=None, Iran=None)
  32. for row in rows:
  33. country = row['Country'].replace(' ', '_')
  34. if country in key_countries:
  35. new_row['Date'] = row['Date']
  36. new_row[country] = row['Confirmed']
  37. if None not in new_row.values():
  38. yield new_row
  39. new_row = dict(Date=None, China=None, US=None, United_Kingdom=None, Italy=None, France=None, Germany=None, Spain=None, Iran=None)
  40. yield process_rows(data_by_key_countries)
  41. data_by_country = next(resources)
  42. yield data_by_country
  43. worldwide = next(resources)
  44. yield worldwide
  45. Flow(
  46. load(f'{BASE_URL}{CONFIRMED}'),
  47. load(f'{BASE_URL}{RECOVERED}'),
  48. load(f'{BASE_URL}{DEATH}'),
  49. checkpoint('load_data'),
  50. unpivot(unpivoting_fields, extra_keys, extra_value),
  51. find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
  52. to_normal_date,
  53. set_type('Date', type='date', format='%d-%m-%y', resources=None),
  54. set_type('Case', type='number', resources=None),
  55. join(
  56. source_name='time_series_covid19_confirmed_global',
  57. source_key=['Province/State', 'Country/Region', 'Date'],
  58. source_delete=True,
  59. target_name='time_series_covid19_deaths_global',
  60. target_key=['Province/State', 'Country/Region', 'Date'],
  61. fields=dict(Confirmed={
  62. 'name': 'Case',
  63. 'aggregate': 'first'
  64. })
  65. ),
  66. join(
  67. source_name='time_series_19-covid-Recovered',
  68. source_key=['Province/State', 'Country/Region', 'Date'],
  69. source_delete=True,
  70. target_name='time_series_covid19_deaths_global',
  71. target_key=['Province/State', 'Country/Region', 'Date'],
  72. fields=dict(Recovered={
  73. 'name': 'Case',
  74. 'aggregate': 'first'
  75. })
  76. ),
  77. add_computed_field(
  78. target={'name': 'Deaths', 'type': 'number'},
  79. operation='format',
  80. with_='{Case}'
  81. ),
  82. delete_fields(['Case']),
  83. update_resource('time_series_covid19_deaths_global', name='time-series-19-covid-combined', path='data/time-series-19-covid-combined.csv'),
  84. update_schema('time-series-19-covid-combined', missingValues=['None', ''], fields=[
  85. {
  86. "format": "%Y-%m-%d",
  87. "name": "Date",
  88. "type": "date"
  89. },
  90. {
  91. "format": "default",
  92. "name": "Country/Region",
  93. "type": "string"
  94. },
  95. {
  96. "format": "default",
  97. "name": "Province/State",
  98. "type": "string"
  99. },
  100. {
  101. "decimalChar": ".",
  102. "format": "default",
  103. "groupChar": "",
  104. "name": "Lat",
  105. "type": "number"
  106. },
  107. {
  108. "decimalChar": ".",
  109. "format": "default",
  110. "groupChar": "",
  111. "name": "Long",
  112. "type": "number"
  113. },
  114. {
  115. "format": "default",
  116. "groupChar": "",
  117. "name": "Confirmed",
  118. "title": "Cumulative total confirmed cases to date",
  119. "type": "integer"
  120. },
  121. {
  122. "format": "default",
  123. "groupChar": "",
  124. "name": "Recovered",
  125. "title": "Cumulative total recovered cases to date",
  126. "type": "integer"
  127. },
  128. {
  129. "format": "default",
  130. "groupChar": "",
  131. "name": "Deaths",
  132. "title": "Cumulative total deaths to date",
  133. "type": "integer"
  134. }
  135. ]),
  136. checkpoint('processed_data'),
  137. # Duplicate the stream to create aggregated data
  138. duplicate(
  139. source='time-series-19-covid-combined',
  140. target_name='worldwide-aggregated',
  141. target_path='data/worldwide-aggregated.csv'
  142. ),
  143. join_with_self(
  144. resource_name='worldwide-aggregated',
  145. join_key=['Date'],
  146. fields=dict(
  147. Date={
  148. 'name': 'Date'
  149. },
  150. Confirmed={
  151. 'name': 'Confirmed',
  152. 'aggregate': 'sum'
  153. },
  154. Recovered={
  155. 'name': 'Recovered',
  156. 'aggregate': 'sum'
  157. },
  158. Deaths={
  159. 'name': 'Deaths',
  160. 'aggregate': 'sum'
  161. }
  162. )
  163. ),
  164. update_schema('worldwide-aggregated', missingValues=['None', ''], fields=[
  165. {
  166. "format": "%Y-%m-%d",
  167. "name": "Date",
  168. "type": "date"
  169. },
  170. {
  171. "format": "default",
  172. "groupChar": "",
  173. "name": "Confirmed",
  174. "title": "Cumulative total confirmed cases to date",
  175. "type": "integer"
  176. },
  177. {
  178. "format": "default",
  179. "groupChar": "",
  180. "name": "Recovered",
  181. "title": "Cumulative total recovered cases to date",
  182. "type": "integer"
  183. },
  184. {
  185. "format": "default",
  186. "groupChar": "",
  187. "name": "Deaths",
  188. "title": "Cumulative total deaths to date",
  189. "type": "integer"
  190. }
  191. ]),
  192. checkpoint('processed_worldwide_data'),
  193. # Create another resource with key countries pivoted
  194. duplicate(
  195. source='time-series-19-covid-combined',
  196. target_name='key-countries-pivoted',
  197. target_path='data/key-countries-pivoted.csv'
  198. ),
  199. join_with_self(
  200. resource_name='key-countries-pivoted',
  201. join_key=['Date', 'Country/Region'],
  202. fields=dict(
  203. Date={
  204. 'name': 'Date'
  205. },
  206. Country={
  207. 'name': 'Country/Region'
  208. },
  209. Confirmed={
  210. 'name': 'Confirmed',
  211. 'aggregate': 'sum'
  212. },
  213. Recovered={
  214. 'name': 'Recovered',
  215. 'aggregate': 'sum'
  216. },
  217. Deaths={
  218. 'name': 'Deaths',
  219. 'aggregate': 'sum'
  220. }
  221. )
  222. ),
  223. update_schema('key-countries-pivoted', missingValues=['None', ''], fields=[
  224. {
  225. "format": "%Y-%m-%d",
  226. "name": "Date",
  227. "type": "date"
  228. },
  229. {
  230. "format": "default",
  231. "name": "Country",
  232. "type": "string"
  233. },
  234. {
  235. "format": "default",
  236. "groupChar": "",
  237. "name": "Confirmed",
  238. "title": "Cumulative total confirmed cases to date",
  239. "type": "integer"
  240. },
  241. {
  242. "format": "default",
  243. "groupChar": "",
  244. "name": "Recovered",
  245. "title": "Cumulative total recovered cases to date",
  246. "type": "integer"
  247. },
  248. {
  249. "format": "default",
  250. "groupChar": "",
  251. "name": "Deaths",
  252. "title": "Cumulative total deaths to date",
  253. "type": "integer"
  254. }
  255. ]),
  256. checkpoint('processed_country_data'),
  257. # All countries aggregated
  258. duplicate(
  259. source='key-countries-pivoted',
  260. target_name='countries-aggregated',
  261. target_path='data/countries-aggregated.csv'
  262. ),
  263. pivot_key_countries,
  264. delete_fields(['Country', 'Confirmed', 'Recovered', 'Deaths'], resources='key-countries-pivoted'),
  265. # Prepare data package (name, title) and add views
  266. update_package(
  267. name='covid-19',
  268. title='Novel Coronavirus 2019',
  269. views=[
  270. {
  271. "title": "Total world to date",
  272. "resources": ["worldwide-aggregated"],
  273. "specType": "simple",
  274. "spec": {
  275. "group": "Date",
  276. "series": ["Confirmed", "Recovered", "Deaths"],
  277. "type": "line"
  278. }
  279. },
  280. {
  281. "title": "Number of confirmed cases in key countries",
  282. "resources": ["key-countries-pivoted"],
  283. "specType": "simple",
  284. "spec": {
  285. "group": "Date",
  286. "series": ["China", "US", "United_Kingdom", "Italy", "France", "Germany", "Spain", "Iran"],
  287. "type": "line"
  288. }
  289. }
  290. ]
  291. ),
  292. dump_to_path()
  293. ).results()[0]