%reset -f
%who
import numpy as np
import pandas as pd
covid_data = pd.read_csv('https://data.humdata.org/hxlproxy/data/download/time_series_covid19_confirmed_global_narrow.csv?dest=data_edit&filter01=merge&merge-url01=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fd%2Fe%2F2PACX-1vTglKQRXpkKSErDiWG6ycqEth32MY0reMuVGhaslImLjfuLU0EUgyyu2e-3vKDArjqGX7dXEBV8FJ4f%2Fpub%3Fgid%3D1326629740%26single%3Dtrue%26output%3Dcsv&merge-keys01=%23country%2Bname&merge-tags01=%23country%2Bcode%2C%23region%2Bmain%2Bcode%2C%23region%2Bsub%2Bcode%2C%23region%2Bintermediate%2Bcode&filter02=merge&merge-url02=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fd%2Fe%2F2PACX-1vTglKQRXpkKSErDiWG6ycqEth32MY0reMuVGhaslImLjfuLU0EUgyyu2e-3vKDArjqGX7dXEBV8FJ4f%2Fpub%3Fgid%3D398158223%26single%3Dtrue%26output%3Dcsv&merge-keys02=%23adm1%2Bname&merge-tags02=%23country%2Bcode%2C%23region%2Bmain%2Bcode%2C%23region%2Bsub%2Bcode%2C%23region%2Bintermediate%2Bcode&merge-replace02=on&merge-overwrite02=on&filter03=explode&explode-header-att03=date&explode-value-att03=value&filter04=rename&rename-oldtag04=%23affected%2Bdate&rename-newtag04=%23date&rename-header04=Date&filter05=rename&rename-oldtag05=%23affected%2Bvalue&rename-newtag05=%23affected%2Binfected%2Bvalue%2Bnum&rename-header05=Value&filter06=clean&clean-date-tags06=%23date&filter07=sort&sort-tags07=%23date&sort-reverse07=on&filter08=sort&sort-tags08=%23country%2Bname%2C%23adm1%2Bname&tagger-match-all=on&tagger-default-tag=%23affected%2Blabel&tagger-01-header=province%2Fstate&tagger-01-tag=%23adm1%2Bname&tagger-02-header=country%2Fregion&tagger-02-tag=%23country%2Bname&tagger-03-header=lat&tagger-03-tag=%23geo%2Blat&tagger-04-header=long&tagger-04-tag=%23geo%2Blon&header-row=1&url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv')
covid_data.info()
covid_data
Dropping the first row (it holds HXL tag labels rather than data) and resetting the row index
covid_data.drop(index = 0, inplace = True)
covid_data.reset_index(drop = True, inplace = True)
covid_data
covid_data.columns = ['state', 'country', 'latitude', 'longitude', 'date', 'total_cases',
'iso_country_code', 'region_code', 'subregion_code', 'intermediate_region_code']
covid_data
covid_data.drop(columns = ['latitude', 'longitude'], inplace = True)
covid_data
covid_data.info()
We convert date to a datetime and total_cases to a numeric type; the counts come in as strings, so we cast them to float first and then down to int
covid_data = covid_data.astype({'date' : 'datetime64[ns]',
                                'total_cases' : 'float64'})
covid_data = covid_data.astype({'total_cases' : 'int64'})
covid_data.info()
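As an aside, the same conversion could be sketched with pandas' explicit parsers, which raise on malformed values instead of quietly casting; the throwaway variables below are only for illustration and are not used later.
parsed_dates = pd.to_datetime(covid_data['date'])         # equivalent to the astype above
parsed_cases = pd.to_numeric(covid_data['total_cases'])   # raises if a value is not numeric
del(parsed_dates, parsed_cases)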
covid_data['state'].unique()
Making a list of states that we want to consider countries
country_states = ['Diamond Princess', 'Grand Princess', 'Hong Kong',
'Faroe Islands', 'Greenland', 'French Guiana',
'French Polynesia', 'Guadeloupe', 'Martinique',
'Mayotte', 'New Caledonia', 'Reunion', 'Saint Barthelemy',
'Saint Pierre and Miquelon', 'St Martin', 'Aruba',
'Bonaire, Sint Eustatius and Saba', 'Curacao', 'Sint Maarten',
'Anguilla', 'Bermuda', 'British Virgin Islands', 'Cayman Islands',
'Channel Islands', 'Falkland Islands (Malvinas)', 'Gibraltar',
'Isle of Man', 'Montserrat', 'Turks and Caicos Islands']
Converting states to countries when they appear in country_states
covid_data.loc[covid_data['state'].isin(country_states), 'country'] = covid_data.loc[covid_data['state'].isin(country_states), 'state'].copy()
Checking our work: the unique country values below should match the country_states list
covid_data['country'][covid_data['country'].isin(country_states)].unique()
pd.Series(country_states).to_numpy()
del(country_states)
Removing state column
covid_data.drop(columns = 'state', inplace = True)
covid_data
Extracting the country-level codes into their own frame and removing duplicates, so there is one row per country
country_data = covid_data[['country', 'iso_country_code', 'region_code', 'subregion_code', 'intermediate_region_code']].copy()
country_data.drop_duplicates(subset = 'country', inplace = True)
country_data.info()
country_data
covid_data
Removing the code columns from covid_data now that they live in country_data
covid_data.drop(columns = ['iso_country_code', 'region_code',
                           'subregion_code', 'intermediate_region_code'],
                inplace = True)
covid_data
Combining rows from the same country on the same day by summing over states, then sorting the data by country and date
covid_data = covid_data.groupby(['country','date'])['total_cases'].sum().reset_index()
covid_data.sort_values(by = ['country','date'], inplace = True)
covid_data
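As a quick, purely illustrative sanity check, the aggregation should have left exactly one row per (country, date) pair:
covid_data.duplicated(subset = ['country', 'date']).any()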
Creating a daily_cases column
covid_data['yesterday_total_cases'] = covid_data['total_cases'].shift(1).copy()
covid_data['daily_cases'] = covid_data['total_cases'] - covid_data['yesterday_total_cases']
covid_data.drop(columns = 'yesterday_total_cases', inplace = True)
covid_data.loc[covid_data['date'] == '2020-01-22T00:00:00.000000000', 'daily_cases'] = covid_data.loc[
covid_data['date'] == '2020-01-22T00:00:00.000000000', 'total_cases']
covid_data = covid_data.astype({'daily_cases' : 'int64'})
covid_data
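The shift above runs across country boundaries; it only gives correct values because every country's series starts on 2020-01-22 and we overwrite that first day afterwards. A per-country diff is a sketch of an equivalent, boundary-safe alternative (the throwaway daily_check is only for comparison):
daily_check = covid_data.groupby('country')['total_cases'].diff()
daily_check = daily_check.fillna(covid_data['total_cases']).astype('int64')
(daily_check == covid_data['daily_cases']).all()
del(daily_check)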
Mapping UN subregion codes to region names in country_data
country_data['region'] = np.nan
country_data.loc[country_data['subregion_code'] == '143', 'region'] = 'Central Asia'
country_data.loc[country_data['subregion_code'] == '145', 'region'] = 'Western Asia'
country_data.loc[country_data['subregion_code'] == '15', 'region'] = 'North Africa'
country_data.loc[country_data['subregion_code'] == '151', 'region'] = 'Eastern Europe'
country_data.loc[country_data['subregion_code'] == '154', 'region'] = 'Northern Europe'
country_data.loc[country_data['subregion_code'] == '155', 'region'] = 'Western Europe'
country_data.loc[country_data['subregion_code'] == '202', 'region'] = 'Non-Northern Africa'
country_data.loc[country_data['subregion_code'] == '21', 'region'] = 'Northern America'
country_data.loc[country_data['subregion_code'] == '30', 'region'] = 'Eastern Asia'
country_data.loc[country_data['subregion_code'] == '34', 'region'] = 'Southern Asia'
country_data.loc[country_data['subregion_code'] == '35', 'region'] = 'Southeastern Asia'
country_data.loc[country_data['subregion_code'] == '39', 'region'] = 'Southern Europe'
country_data.loc[country_data['subregion_code'] == '419', 'region'] = 'Central and South America'
country_data.loc[country_data['subregion_code'].isin(['53', '54', '61']), 'region'] = 'Oceania'
covid_data = covid_data.merge(country_data[['country', 'region']], how = 'left')
covid_data
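The repeated .loc assignments above can also be written as a single dictionary lookup; this is just an illustrative sketch using the notebook's own region labels, with a throwaway region_check series to confirm it matches:
region_map = {'143' : 'Central Asia', '145' : 'Western Asia', '15' : 'North Africa',
              '151' : 'Eastern Europe', '154' : 'Northern Europe', '155' : 'Western Europe',
              '202' : 'Non-Northern Africa', '21' : 'Northern America', '30' : 'Eastern Asia',
              '34' : 'Southern Asia', '35' : 'Southeastern Asia', '39' : 'Southern Europe',
              '419' : 'Central and South America',
              '53' : 'Oceania', '54' : 'Oceania', '61' : 'Oceania'}
region_check = country_data['subregion_code'].map(region_map)
(region_check.fillna('') == country_data['region'].fillna('')).all()
del(region_map, region_check)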
These continent groupings assign a few countries differently than the geographic continents do
covid_data['continent'] = covid_data['region'].copy()
covid_data.loc[covid_data['continent'].str.contains(r'Africa$').fillna(False), 'continent'] = 'Africa'
covid_data.loc[covid_data['continent'].str.contains(r'Asia$').fillna(False), 'continent'] = 'Asia'
covid_data.loc[covid_data['continent'].str.contains(r'Europe$').fillna(False), 'continent'] = 'Europe'
covid_data = covid_data[['continent', 'region', 'country', 'date', 'total_cases', 'daily_cases']]
print(covid_data['continent'].unique())
covid_data
Creating Eurasia data from Asia and Europe data
asia_data = covid_data.loc[covid_data['continent'] == 'Asia', :]
europe_data = covid_data.loc[covid_data['continent'] == 'Europe', :]
eurasia_data = pd.concat([asia_data , europe_data], ignore_index = True)
eurasia_data
del(asia_data, eurasia_data, europe_data)
Adding in ISO Country Codes to Covid Data
covid_data = pd.merge(covid_data, country_data[['country', 'iso_country_code']], how = 'left', on = 'country')
covid_data = covid_data[['continent', 'region', 'country', 'iso_country_code', 'date', 'total_cases', 'daily_cases']]
covid_data
covid_wide_data = covid_data.astype({'date' : 'str'})
covid_wide_data = covid_wide_data.pivot(index = 'country', columns = 'date', values = ['total_cases', 'daily_cases'])
covid_wide_data.columns = ['_'.join(column).strip() for column in covid_wide_data.columns.values]
covid_wide_data.index.name = None
covid_wide_data
covid_long_data = covid_wide_data.stack()
covid_long_data = covid_long_data.to_frame()
covid_long_data.reset_index(inplace = True)
covid_long_data.columns = ['country', 'case_type_and_date', 'cases']
covid_long_data['case_type'] = ''
covid_long_data.loc[covid_long_data['case_type_and_date'].str.contains(r'^total_cases_') ,'case_type'] = 'total'
covid_long_data.loc[covid_long_data['case_type_and_date'].str.contains(r'^daily_cases_') ,'case_type'] = 'daily'
covid_long_data['case_type_and_date'] = covid_long_data['case_type_and_date'].str[12:]
covid_long_data.columns = ['country', 'date', 'cases', 'case_type']
covid_long_data = covid_long_data[['country', 'date', 'case_type', 'cases']]
covid_long_data
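The same reshape back to long form could also be sketched with melt instead of stack; note that stack drops missing values by default while melt keeps them, so row counts can differ if the wide frame has gaps. The melted variable is only for illustration:
melted = covid_wide_data.reset_index().melt(id_vars = 'index',
                                            var_name = 'case_type_and_date',
                                            value_name = 'cases')
melted = melted.rename(columns = {'index' : 'country'})
melted
del(melted)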
del(covid_wide_data, covid_long_data)
covid_data.info()
covid_data.dropna(axis = 'index', how = 'any', inplace = True)
covid_data.info()
aggregate_data = covid_data.loc[covid_data['date'] == covid_data['date'].max(), ['region', 'total_cases']]
aggregate_data = aggregate_data.groupby('region').sum()
aggregate_data.index.name = None
aggregate_data
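A sorted view of the same aggregate, largest regions first (just for readability):
aggregate_data.sort_values(by = 'total_cases', ascending = False)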
del(aggregate_data)
us_data = covid_data.loc[covid_data['country'] == 'US', :]
us_data = us_data.loc[us_data['date'] >= '2020-03-01', :]
us_data = us_data.reset_index()
us_data.info()
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(us_data['date'], us_data['daily_cases'])
plt.show()
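A lightly polished version of the same bar chart, with a title, axis labels, and rotated date ticks; this is purely cosmetic and uses only standard matplotlib calls:
fig = plt.figure(figsize = (10, 4))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(us_data['date'], us_data['daily_cases'])
ax.set_title('US daily confirmed COVID-19 cases')
ax.set_xlabel('date')
ax.set_ylabel('daily cases')
ax.tick_params(axis = 'x', labelrotation = 45)  # keep the date ticks readable
plt.show()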
del(us_data, plt, fig, ax)