You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

256 lines
7.7 KiB

  1. from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_package, update_resource, update_schema, join, join_with_self, add_computed_field, delete_fields, checkpoint, duplicate, filter_rows
  2. BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
  3. CONFIRMED = 'time_series_19-covid-Confirmed.csv'
  4. DEATH = 'time_series_19-covid-Deaths.csv'
  5. RECOVERED = 'time_series_19-covid-Recovered.csv'
  6. def to_normal_date(row):
  7. old_date = row['Date']
  8. month, day, year = row['Date'].split('-')
  9. day = f'0{day}' if len(day) == 1 else day
  10. month = f'0{month}' if len(month) == 1 else month
  11. row['Date'] = '-'.join([day, month, year])
  12. unpivoting_fields = [
  13. { 'name': '([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'Date': r'\1'} }
  14. ]
  15. extra_keys = [{'name': 'Date', 'type': 'string'} ]
  16. extra_value = {'name': 'Case', 'type': 'number'}
  17. def is_key_country(row):
  18. key_countries = ['Chine', 'US', 'United Kingdom', 'Italy', 'France', 'Germany']
  19. return row['Country'] in key_countries
  20. Flow(
  21. load(f'{BASE_URL}{CONFIRMED}'),
  22. load(f'{BASE_URL}{RECOVERED}'),
  23. load(f'{BASE_URL}{DEATH}'),
  24. checkpoint('load_data'),
  25. unpivot(unpivoting_fields, extra_keys, extra_value),
  26. find_replace([{'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]}]),
  27. to_normal_date,
  28. set_type('Date', type='date', format='%d-%m-%y', resources=None),
  29. set_type('Case', type='number', resources=None),
  30. join(
  31. source_name='time_series_19-covid-Confirmed',
  32. source_key=['Province/State', 'Country/Region', 'Date'],
  33. source_delete=True,
  34. target_name='time_series_19-covid-Deaths',
  35. target_key=['Province/State', 'Country/Region', 'Date'],
  36. fields=dict(Confirmed={
  37. 'name': 'Case',
  38. 'aggregate': 'first'
  39. })
  40. ),
  41. join(
  42. source_name='time_series_19-covid-Recovered',
  43. source_key=['Province/State', 'Country/Region', 'Date'],
  44. source_delete=True,
  45. target_name='time_series_19-covid-Deaths',
  46. target_key=['Province/State', 'Country/Region', 'Date'],
  47. fields=dict(Recovered={
  48. 'name': 'Case',
  49. 'aggregate': 'first'
  50. })
  51. ),
  52. add_computed_field(
  53. target={'name': 'Deaths', 'type': 'number'},
  54. operation='format',
  55. with_='{Case}'
  56. ),
  57. delete_fields(['Case']),
  58. update_resource('time_series_19-covid-Deaths', name='time-series-19-covid-combined', path='data/time-series-19-covid-combined.csv'),
  59. update_schema('time-series-19-covid-combined', missingValues=['None', None, ''], fields=[
  60. {
  61. "format": "%Y-%m-%d",
  62. "name": "Date",
  63. "type": "date"
  64. },
  65. {
  66. "format": "default",
  67. "name": "Country/Region",
  68. "type": "string"
  69. },
  70. {
  71. "format": "default",
  72. "name": "Province/State",
  73. "type": "string"
  74. },
  75. {
  76. "decimalChar": ".",
  77. "format": "default",
  78. "groupChar": "",
  79. "name": "Lat",
  80. "type": "number"
  81. },
  82. {
  83. "decimalChar": ".",
  84. "format": "default",
  85. "groupChar": "",
  86. "name": "Long",
  87. "type": "number"
  88. },
  89. {
  90. "format": "default",
  91. "groupChar": "",
  92. "name": "Confirmed",
  93. "title": "Cumulative total confirmed cases to date",
  94. "type": "integer"
  95. },
  96. {
  97. "format": "default",
  98. "groupChar": "",
  99. "name": "Recovered",
  100. "title": "Cumulative total recovered cases to date",
  101. "type": "integer"
  102. },
  103. {
  104. "format": "default",
  105. "groupChar": "",
  106. "name": "Deaths",
  107. "title": "Cumulative total deaths to date",
  108. "type": "integer"
  109. }
  110. ]),
  111. checkpoint('processed_data'),
  112. # Duplicate the stream to create aggregated data
  113. duplicate(
  114. source='time-series-19-covid-combined',
  115. target_name='worldwide-aggregated',
  116. target_path='data/worldwide-aggregated.csv'
  117. ),
  118. join_with_self(
  119. resource_name='worldwide-aggregated',
  120. join_key=['Date'],
  121. fields=dict(
  122. Date={
  123. 'name': 'Date'
  124. },
  125. Confirmed={
  126. 'name': 'Confirmed',
  127. 'aggregate': 'sum'
  128. },
  129. Recovered={
  130. 'name': 'Recovered',
  131. 'aggregate': 'sum'
  132. },
  133. Deaths={
  134. 'name': 'Deaths',
  135. 'aggregate': 'sum'
  136. }
  137. )
  138. ),
  139. update_schema('worldwide-aggregated', missingValues=['None', None, ''], fields=[
  140. {
  141. "format": "%Y-%m-%d",
  142. "name": "Date",
  143. "type": "date"
  144. },
  145. {
  146. "format": "default",
  147. "groupChar": "",
  148. "name": "Confirmed",
  149. "title": "Cumulative total confirmed cases to date",
  150. "type": "integer"
  151. },
  152. {
  153. "format": "default",
  154. "groupChar": "",
  155. "name": "Recovered",
  156. "title": "Cumulative total recovered cases to date",
  157. "type": "integer"
  158. },
  159. {
  160. "format": "default",
  161. "groupChar": "",
  162. "name": "Deaths",
  163. "title": "Cumulative total deaths to date",
  164. "type": "integer"
  165. }
  166. ]),
  167. checkpoint('processed_worldwide_data'),
  168. # Create another resource with countries aggregated
  169. duplicate(
  170. source='time-series-19-covid-combined',
  171. target_name='countries-aggregated',
  172. target_path='data/countries-aggregated.csv'
  173. ),
  174. join_with_self(
  175. resource_name='countries-aggregated',
  176. join_key=['Date', 'Country/Region'],
  177. fields=dict(
  178. Date={
  179. 'name': 'Date'
  180. },
  181. Country={
  182. 'name': 'Country/Region'
  183. },
  184. Confirmed={
  185. 'name': 'Confirmed',
  186. 'aggregate': 'sum'
  187. },
  188. Recovered={
  189. 'name': 'Recovered',
  190. 'aggregate': 'sum'
  191. },
  192. Deaths={
  193. 'name': 'Deaths',
  194. 'aggregate': 'sum'
  195. }
  196. )
  197. ),
  198. update_schema('countries-aggregated', missingValues=['None', None, ''], fields=[
  199. {
  200. "format": "%Y-%m-%d",
  201. "name": "Date",
  202. "type": "date"
  203. },
  204. {
  205. "format": "default",
  206. "name": "Country",
  207. "type": "string"
  208. },
  209. {
  210. "format": "default",
  211. "groupChar": "",
  212. "name": "Confirmed",
  213. "title": "Cumulative total confirmed cases to date",
  214. "type": "integer"
  215. },
  216. {
  217. "format": "default",
  218. "groupChar": "",
  219. "name": "Recovered",
  220. "title": "Cumulative total recovered cases to date",
  221. "type": "integer"
  222. },
  223. {
  224. "format": "default",
  225. "groupChar": "",
  226. "name": "Deaths",
  227. "title": "Cumulative total deaths to date",
  228. "type": "integer"
  229. }
  230. ]),
  231. checkpoint('processed_country_data'),
  232. # Prepare data package (name, title) and add views
  233. update_package(
  234. name='covid-19',
  235. title='Novel Coronavirus 2019',
  236. views=[
  237. {
  238. "title": "Total world to date",
  239. "resources": ["worldwide-aggregated"],
  240. "specType": "simple",
  241. "spec": {
  242. "group": "Date",
  243. "series": ["Confirmed", "Recovered", "Deaths"],
  244. "type": "line"
  245. }
  246. }
  247. ]
  248. ),
  249. dump_to_path()
  250. ).results()[0]