You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.3 KiB

  1. from dataflows import Flow, load, unpivot, find_replace, set_type, dump_to_path, update_resource, join, add_computed_field, delete_fields
  # Raw-content root of the time-series folder in the CSSEGISandData/COVID-19
  # GitHub repository. The file names below are appended to it to build the
  # URLs passed to load().
  BASE_URL = (
      'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
      'csse_covid_19_data/csse_covid_19_time_series/'
  )

  # One CSV file per case category.
  CONFIRMED = 'time_series_19-covid-Confirmed.csv'
  DEATH = 'time_series_19-covid-Deaths.csv'
  RECOVERED = 'time_series_19-covid-Recovered.csv'
  def to_normal_date(row) -> None:
      """Rewrite ``row['Date']`` in place from ``M-D-Y`` to ``DD-MM-Y``.

      The value is split on ``'-'`` into month, day and year. Day and month
      swap places and are left-padded with ``'0'`` to two characters; the
      year part is passed through unchanged.

      Args:
          row: Mutable mapping whose ``'Date'`` entry is a string made of
              exactly three ``'-'``-separated parts.

      Returns:
          None. The row is modified in place.

      Raises:
          ValueError: If ``row['Date']`` does not split into exactly three
              parts.
      """
      month, day, year = row['Date'].split('-')
      # zfill(2) pads single-character parts and leaves longer ones untouched.
      row['Date'] = '-'.join([day.zfill(2), month.zfill(2), year])
  # Arguments for unpivot(). The pattern matches column names shaped like
  # digits/digits/digits (the per-day columns); the captured text is written
  # to the new 'Date' key via the \1 back-reference.
  # The pattern is a raw string: in a normal literal '\/' is an invalid escape
  # sequence that triggers a warning, while the resulting value is the same.
  unpivoting_fields = [
      {'name': r'([0-9]+\/[0-9]+\/[0-9]+)', 'keys': {'Date': r'\1'}},
  ]
  # Declared type of the new key column; it stays a string until set_type()
  # converts it later in the flow.
  extra_keys = [{'name': 'Date', 'type': 'string'}]
  # Column that receives the cell values of the unpivoted date columns.
  extra_value = {'name': 'Case', 'type': 'number'}
  # Build one combined time series from the three per-category CSVs and dump
  # it. The flow runs when this statement executes; the value selected by
  # `.results()[0]` is discarded.
  Flow(
      # Load the three source files. The resource names used by the joins
      # below presumably derive from these file names - verify against the
      # dataflows `load` documentation.
      load(BASE_URL + CONFIRMED),
      load(BASE_URL + RECOVERED),
      load(BASE_URL + DEATH),
      # Fold the per-day columns into ('Date', 'Case') pairs.
      unpivot(unpivoting_fields, extra_keys, extra_value),
      # Turn 'M/D/Y' into 'M-D-Y', then into zero-padded 'DD-MM-Y' so the
      # '%d-%m-%y' format below can parse it.
      find_replace([
          {'name': 'Date', 'patterns': [{'find': '/', 'replace': '-'}]},
      ]),
      to_normal_date,
      set_type('Date', type='date', format='%d-%m-%y', resources=None),
      set_type('Case', type='number', resources=None),
      # Attach the Confirmed and then the Recovered counts to the Deaths
      # resource, matching on province, country and date. Each source
      # resource is deleted once joined (source_delete=True).
      *(
          join(
              source_name=source,
              source_key=['Province/State', 'Country/Region', 'Date'],
              source_delete=True,
              target_name='time_series_19-covid-Deaths',
              target_key=['Province/State', 'Country/Region', 'Date'],
              fields={column: {'name': 'Case', 'aggregate': 'first'}},
          )
          for source, column in (
              ('time_series_19-covid-Confirmed', 'Confirmed'),
              ('time_series_19-covid-Recovered', 'Recovered'),
          )
      ),
      # The target's own 'Case' column is copied into a new 'Deaths' field
      # through a format template, then the generic 'Case' column is dropped.
      add_computed_field(
          target={'name': 'Deaths', 'type': 'number'},
          operation='format',
          with_='{Case}',
      ),
      delete_fields(['Case']),
      # Rename the surviving resource and its output file.
      update_resource(
          'time_series_19-covid-Deaths',
          name='time-series-19-covid-combined',
          path='time-series-19-covid-combined.csv',
      ),
      # No arguments are passed, so the output location is the library default.
      dump_to_path(),
  ).results()[0]