# Import pandas for data wrangling and Altair for plotting
import pandas as pd
import altair as alt
df_tot_sc = pd.read_excel('https://epistat.sciensano.be/Data/COVID19BE.xlsx')
df_inhab = pd.read_excel('https://statbel.fgov.be/sites/default/files/files/opendata/bevolking%20naar%20woonplaats%2C%20nationaliteit%20burgelijke%20staat%20%2C%20leeftijd%20en%20geslacht/TF_SOC_POP_STRUCT_2019.xlsx')
df_inhab
CD_REFNIS TX_DESCR_NL TX_DESCR_FR CD_DSTR_REFNIS TX_ADM_DSTR_DESCR_NL TX_ADM_DSTR_DESCR_FR CD_PROV_REFNIS TX_PROV_DESCR_NL TX_PROV_DESCR_FR CD_RGN_REFNIS TX_RGN_DESCR_NL TX_RGN_DESCR_FR CD_SEX CD_NATLTY TX_NATLTY_NL TX_NATLTY_FR CD_CIV_STS TX_CIV_STS_NL TX_CIV_STS_FR CD_AGE MS_POPULATION
0 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 69 11
1 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 80 3
2 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande M BEL Belgen Belges 4 Gescheiden Divorcé 30 2
3 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 48 26
4 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 76 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
463376 93090 Viroinval Viroinval 93000 Arrondissement Philippeville Arrondissement de Philippeville 90000.0 Provincie Namen Province de Namur 3000 Waals Gewest Région wallonne F BEL Belgen Belges 3 Weduwstaat Veuf 73 10
463377 93090 Viroinval Viroinval 93000 Arrondissement Philippeville Arrondissement de Philippeville 90000.0 Provincie Namen Province de Namur 3000 Waals Gewest Région wallonne M BEL Belgen Belges 3 Weduwstaat Veuf 64 1
463378 93090 Viroinval Viroinval 93000 Arrondissement Philippeville Arrondissement de Philippeville 90000.0 Provincie Namen Province de Namur 3000 Waals Gewest Région wallonne M BEL Belgen Belges 3 Weduwstaat Veuf 86 3
463379 93090 Viroinval Viroinval 93000 Arrondissement Philippeville Arrondissement de Philippeville 90000.0 Provincie Namen Province de Namur 3000 Waals Gewest Région wallonne M ETR niet-Belgen non-Belges 3 Weduwstaat Veuf 74 1
463380 93090 Viroinval Viroinval 93000 Arrondissement Philippeville Arrondissement de Philippeville 90000.0 Provincie Namen Province de Namur 3000 Waals Gewest Région wallonne M BEL Belgen Belges 3 Weduwstaat Veuf 52 1

463381 rows × 21 columns

inhab_provence = df_inhab['TX_PROV_DESCR_NL'].dropna().unique()
inhab_provence
array(['Provincie Antwerpen', 'Provincie Vlaams-Brabant',
       'Provincie Waals-Brabant', 'Provincie West-Vlaanderen',
       'Provincie Oost-Vlaanderen', 'Provincie Henegouwen',
       'Provincie Luik', 'Provincie Limburg', 'Provincie Luxemburg',
       'Provincie Namen'], dtype=object)
sc_provence = df_tot_sc['PROVINCE'].unique()
sc_provence
array(['Brussels', 'Liège', 'Limburg', 'OostVlaanderen', 'VlaamsBrabant',
       'Antwerpen', 'WestVlaanderen', 'BrabantWallon', 'Hainaut', 'Namur',
       nan, 'Luxembourg'], dtype=object)
[p.split()[1] for p in inhab_provence]
['Antwerpen',
 'Vlaams-Brabant',
 'Waals-Brabant',
 'West-Vlaanderen',
 'Oost-Vlaanderen',
 'Henegouwen',
 'Luik',
 'Limburg',
 'Luxemburg',
 'Namen']
map_statbel_provence_to_sc_provence = {'Provincie Antwerpen':'Antwerpen', 'Provincie Vlaams-Brabant':'VlaamsBrabant',
       'Provincie Waals-Brabant':'BrabantWallon', 'Provincie West-Vlaanderen':'WestVlaanderen',
       'Provincie Oost-Vlaanderen':'OostVlaanderen', 'Provincie Henegouwen':'Hainaut',
       'Provincie Luik':'Liège', 'Provincie Limburg':'Limburg', 'Provincie Luxemburg':'Luxembourg',
       'Provincie Namen':'Namur'}
df_inhab['sc_provence'] = df_inhab['TX_PROV_DESCR_NL'].map(map_statbel_provence_to_sc_provence)
df_tot_sc['AGEGROUP'].unique()
array(['10-19', '20-29', '30-39', '40-49', '50-59', '70-79', '60-69',
       '0-9', '90+', '80-89', nan], dtype=object)
df_inhab['AGEGROUP'] =pd.cut(df_inhab['CD_AGE'], bins=[0,10,20,30,40,50,60,70,80,90,200], labels=['0-9','10-19','20-29','30-39','40-49','50-59','60-69','70-79','80-89','90+'], include_lowest=True)
df_inhab_gender_prov = df_inhab.groupby(['sc_provence', 'CD_SEX', 'AGEGROUP'])['MS_POPULATION'].sum().reset_index()
df_inhab_gender_prov_cases = pd.merge(df_inhab_gender_prov, df_tot_sc.dropna(), left_on=['sc_provence', 'AGEGROUP', 'CD_SEX'], right_on=['PROVINCE', 'AGEGROUP', 'SEX'])
df_inhab_gender_prov_cases.head()
sc_provence CD_SEX AGEGROUP MS_POPULATION DATE PROVINCE REGION SEX CASES
0 Antwerpen F 0-9 113851 2020-03-05 Antwerpen Flanders F 1
1 Antwerpen F 0-9 113851 2020-03-18 Antwerpen Flanders F 1
2 Antwerpen F 0-9 113851 2020-03-26 Antwerpen Flanders F 1
3 Antwerpen F 0-9 113851 2020-03-30 Antwerpen Flanders F 1
4 Antwerpen F 0-9 113851 2020-04-03 Antwerpen Flanders F 1
df_plot = df_inhab_gender_prov_cases.groupby(['SEX', 'AGEGROUP', 'PROVINCE']).agg(CASES = ('CASES', 'sum'), MS_POPULATION=('MS_POPULATION', 'first')).reset_index()
df_plot
SEX AGEGROUP PROVINCE CASES MS_POPULATION
0 F 0-9 Antwerpen 9 113851
1 F 0-9 BrabantWallon 3 23744
2 F 0-9 Hainaut 11 81075
3 F 0-9 Limburg 11 48102
4 F 0-9 Liège 19 67479
... ... ... ... ... ...
195 M 90+ Luxembourg 17 469
196 M 90+ Namur 27 827
197 M 90+ OostVlaanderen 102 3105
198 M 90+ VlaamsBrabant 129 2611
199 M 90+ WestVlaanderen 121 3292

200 rows × 5 columns

df_plot['PROVINCE'].unique()
array(['Antwerpen', 'BrabantWallon', 'Hainaut', 'Limburg', 'Liège',
       'Luxembourg', 'Namur', 'OostVlaanderen', 'VlaamsBrabant',
       'WestVlaanderen'], dtype=object)
alt.Chart(df_plot).mark_bar().encode(x='AGEGROUP:N', y='CASES', color='SEX:N', column='PROVINCE:N')
df_plot['percentage'] = df_plot['CASES'] / df_plot['MS_POPULATION']
alt.Chart(df_plot).mark_bar().encode(x='AGEGROUP:N', y='percentage', color='SEX:N', column='PROVINCE:N')

Let's add a colorscale the makes the male blue and female number pink.

color_scale = alt.Scale(domain=['M', 'F'],
                        range=['#1f77b4', '#e377c2'])
alt.Chart(df_plot).mark_bar().encode(
    x='AGEGROUP:N', 
    y='percentage', 
    color=alt.Color('SEX:N', scale=color_scale, legend=None),
    column='PROVINCE:N')

The graph's get to wide. Let's use faceting to make two rows.

Inspired and based on https://altair-viz.github.io/gallery/us_population_pyramid_over_time.html

#slider = alt.binding_range(min=1850, max=2000, step=10)
# select_province = alt.selection_single(name='PROVINCE', fields=['PROVINCE'],
#                                    bind=slider, init={'PROVINCE': 'Antwerpen'})
color_scale = alt.Scale(domain=['Male', 'Female'],
                        range=['#1f77b4', '#e377c2'])

select_province = alt.selection_multi(fields=['PROVINCE'], bind='legend')

base = alt.Chart(df_plot).add_selection(
    select_province
).transform_filter(
    select_province
).transform_calculate(
    gender=alt.expr.if_(alt.datum.SEX == 'M', 'Male', 'Female')
).properties(
    width=250
)

left = base.transform_filter(
    alt.datum.gender == 'Female'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    x=alt.X('percentage:Q', axis=alt.Axis(format='.0%'),
            title='Percentage',
            sort=alt.SortOrder('descending'),
            ),
    color=alt.Color('gender:N', scale=color_scale, legend=None),
).mark_bar().properties(title='Female')

middle = base.encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    text=alt.Text('AGEGROUP:O'),
).mark_text().properties(width=20)

right = base.transform_filter(
    alt.datum.gender == 'Male'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    x=alt.X('percentage:Q', title='Percentage', axis=alt.Axis(format='.0%'),),
    color=alt.Color('gender:N', scale=color_scale, legend=None)
).mark_bar().properties(title='Male')

# legend = alt.Chart(df_plot).mark_text().encode(
#     y=alt.Y('PROVINCE:N', axis=None),
#     text=alt.Text('PROVINCE:N'),
#     color=alt.Color('PROVINCE:N', legend=alt.Legend(title="Provincie"))
# )

alt.concat(left, middle, right, spacing=5)

#legend=alt.Legend(title="Species by color")
provinces = df_plot['PROVINCE'].unique()
select_province = alt.selection_single(
    name='Select', # name the selection 'Select'
    fields=['PROVINCE'], # limit selection to the Major_Genre field
    init={'PROVINCE': 'Antwerpen'}, # use first genre entry as initial value
    bind=alt.binding_select(options=provinces) # bind to a menu of unique provence values
)


base = alt.Chart(df_plot).add_selection(
    select_province
).transform_filter(
    select_province
).transform_calculate(
    gender=alt.expr.if_(alt.datum.SEX == 'M', 'Male', 'Female')
).properties(
    width=250
)

left = base.transform_filter(
    alt.datum.gender == 'Female'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    x=alt.X('percentage:Q', axis=alt.Axis(format='.0%'),
            title='Percentage',
            sort=alt.SortOrder('descending'),
            scale=alt.Scale(domain=(0.0, 0.1), clamp=True)
            ),
    color=alt.Color('gender:N', scale=color_scale, legend=None),
    tooltip=[alt.Tooltip('percentage', format='.1%')]
).mark_bar().properties(title='Female')

middle = base.encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    text=alt.Text('AGEGROUP:O'),
).mark_text().properties(width=20)

right = base.transform_filter(
    alt.datum.gender == 'Male'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    x=alt.X('percentage:Q', title='Percentage', axis=alt.Axis(format='.1%'), scale=alt.Scale(domain=(0.0, 0.1), clamp=True)),
    color=alt.Color('gender:N', scale=color_scale, legend=None),
    tooltip=[alt.Tooltip('percentage', format='.1%')]
).mark_bar().properties(title='Male')

alt.concat(left, middle, right, spacing=5).properties(title='Percentage of covid-19 cases per province, gender and age grup in Belgium')

Mortality

# https://epistat.wiv-isp.be/covid/
# Dataset of mortality by date, age, sex, and region
df_dead_sc = pd.read_csv('https://epistat.sciensano.be/Data/COVID19BE_MORT.csv')
df_dead_sc.head()
DATE REGION AGEGROUP SEX DEATHS
0 2020-03-10 Brussels 85+ F 1
1 2020-03-11 Flanders 85+ F 1
2 2020-03-11 Brussels 75-84 M 1
3 2020-03-11 Brussels 85+ F 1
4 2020-03-12 Brussels 75-84 M 1
df_dead_sc['REGION'].value_counts()
Wallonia    291
Flanders    275
Brussels    271
Name: REGION, dtype: int64
df_dead_sc['AGEGROUP'].value_counts()
85+      223
75-84    205
65-74    179
45-64    132
25-44     19
0-24       1
Name: AGEGROUP, dtype: int64
df_inhab['AGEGROUP_sc'] =pd.cut(df_inhab['CD_AGE'], bins=[0,24,44,64,74,84,200], labels=['0-24','25-44','45-64','65-74','75-84','85+'], include_lowest=True)
df_inhab.groupby('AGEGROUP_sc').agg(lowest_age=('CD_AGE', 'min'), highest_age=('CD_AGE', max))
lowest_age highest_age
AGEGROUP_sc
0-24 0 24
25-44 25 44
45-64 45 64
65-74 65 74
75-84 75 84
85+ 85 110
df_inhab.head()
CD_REFNIS TX_DESCR_NL TX_DESCR_FR CD_DSTR_REFNIS TX_ADM_DSTR_DESCR_NL TX_ADM_DSTR_DESCR_FR CD_PROV_REFNIS TX_PROV_DESCR_NL TX_PROV_DESCR_FR CD_RGN_REFNIS TX_RGN_DESCR_NL TX_RGN_DESCR_FR CD_SEX CD_NATLTY TX_NATLTY_NL TX_NATLTY_FR CD_CIV_STS TX_CIV_STS_NL TX_CIV_STS_FR CD_AGE MS_POPULATION sc_provence AGEGROUP AGEGROUP_sc
0 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 69 11 Antwerpen 60-69 65-74
1 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 80 3 Antwerpen 70-79 75-84
2 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande M BEL Belgen Belges 4 Gescheiden Divorcé 30 2 Antwerpen 20-29 25-44
3 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 48 26 Antwerpen 40-49 45-64
4 11001 Aartselaar Aartselaar 11000 Arrondissement Antwerpen Arrondissement d’Anvers 10000.0 Provincie Antwerpen Province d’Anvers 2000 Vlaams Gewest Région flamande F BEL Belgen Belges 4 Gescheiden Divorcé 76 2 Antwerpen 70-79 75-84
df_dead_sc['REGION'].unique()
array(['Brussels', 'Flanders', 'Wallonia'], dtype=object)
df_inhab['TX_RGN_DESCR_NL'].value_counts()
Vlaams Gewest                     242865
Waals Gewest                      199003
Brussels Hoofdstedelijk Gewest     21513
Name: TX_RGN_DESCR_NL, dtype: int64
df_inhab_gender_prov = df_inhab.groupby(['TX_RGN_DESCR_NL', 'CD_SEX', 'AGEGROUP_sc'])['MS_POPULATION'].sum().reset_index()
region_sc_to_region_inhad = {'Flanders':'Vlaams Gewest', 'Wallonia':'Waals Gewest', 'Brussels':'Brussels Hoofdstedelijk Gewest'}
df_dead_sc['TX_RGN_DESCR_NL'] = df_dead_sc['REGION'].map(region_sc_to_region_inhad)
df_dead_sc.groupby(['TX_RGN_DESCR_NL', 'AGEGROUP', 'SEX'])['DEATHS'].sum()
TX_RGN_DESCR_NL                 AGEGROUP  SEX
Brussels Hoofdstedelijk Gewest  25-44     F        1
                                          M        4
                                45-64     F       21
                                          M       43
                                65-74     F       42
                                          M       71
                                75-84     F      128
                                          M      170
                                85+       F      270
                                          M      186
Vlaams Gewest                   0-24      F        1
                                25-44     F        2
                                          M        3
                                45-64     F       27
                                          M       63
                                65-74     F       67
                                          M      130
                                75-84     F      199
                                          M      335
                                85+       F      232
                                          M      309
Waals Gewest                    25-44     F        5
                                          M        4
                                45-64     F       41
                                          M       89
                                65-74     F       98
                                          M      186
                                75-84     F      290
                                          M      300
                                85+       F      704
                                          M      421
Name: DEATHS, dtype: int64
df_dead_sc_region_agegroup_gender = df_dead_sc.groupby(['TX_RGN_DESCR_NL', 'AGEGROUP', 'SEX'])['DEATHS'].sum().reset_index()
df_inhab_gender_prov_deaths = pd.merge(df_inhab_gender_prov, df_dead_sc_region_agegroup_gender, left_on=['TX_RGN_DESCR_NL', 'AGEGROUP_sc', 'CD_SEX'], right_on=['TX_RGN_DESCR_NL', 'AGEGROUP', 'SEX'])
df_inhab_gender_prov_deaths['MS_POPULATION'].sum()
9077403
df_inhab_gender_prov_deaths['DEATHS'].sum()
4442
df_inhab_gender_prov_deaths
TX_RGN_DESCR_NL CD_SEX AGEGROUP_sc MS_POPULATION AGEGROUP SEX DEATHS
0 Brussels Hoofdstedelijk Gewest F 25-44 197579 25-44 F 1
1 Brussels Hoofdstedelijk Gewest F 45-64 137628 45-64 F 21
2 Brussels Hoofdstedelijk Gewest F 65-74 45214 65-74 F 42
3 Brussels Hoofdstedelijk Gewest F 75-84 30059 75-84 F 128
4 Brussels Hoofdstedelijk Gewest F 85+ 18811 85+ F 270
5 Brussels Hoofdstedelijk Gewest M 25-44 194988 25-44 M 4
6 Brussels Hoofdstedelijk Gewest M 45-64 140348 45-64 M 43
7 Brussels Hoofdstedelijk Gewest M 65-74 36698 65-74 M 71
8 Brussels Hoofdstedelijk Gewest M 75-84 19969 75-84 M 170
9 Brussels Hoofdstedelijk Gewest M 85+ 7918 85+ M 186
10 Vlaams Gewest F 0-24 874891 0-24 F 1
11 Vlaams Gewest F 25-44 820036 25-44 F 2
12 Vlaams Gewest F 45-64 901554 45-64 F 27
13 Vlaams Gewest F 65-74 353925 65-74 F 67
14 Vlaams Gewest F 75-84 245981 75-84 F 199
15 Vlaams Gewest F 85+ 132649 85+ F 232
16 Vlaams Gewest M 25-44 827281 25-44 M 3
17 Vlaams Gewest M 45-64 917008 45-64 M 63
18 Vlaams Gewest M 65-74 336242 65-74 M 130
19 Vlaams Gewest M 75-84 193576 75-84 M 335
20 Vlaams Gewest M 85+ 69678 85+ M 309
21 Waals Gewest F 25-44 457356 25-44 F 5
22 Waals Gewest F 45-64 496668 45-64 F 41
23 Waals Gewest F 65-74 199422 65-74 F 98
24 Waals Gewest F 75-84 118224 75-84 F 290
25 Waals Gewest F 85+ 68502 85+ F 704
26 Waals Gewest M 25-44 459444 25-44 M 4
27 Waals Gewest M 45-64 487322 45-64 M 89
28 Waals Gewest M 65-74 175508 65-74 M 186
29 Waals Gewest M 75-84 82876 75-84 M 300
30 Waals Gewest M 85+ 30048 85+ M 421
df_inhab_gender_prov_deaths['percentage'] = df_inhab_gender_prov_deaths['DEATHS']/df_inhab_gender_prov_deaths['MS_POPULATION']
df_plot = df_inhab_gender_prov_deaths
regions = df_plot['TX_RGN_DESCR_NL'].unique()
select_province = alt.selection_single(
    name='Select', # name the selection 'Select'
    fields=['TX_RGN_DESCR_NL'], # limit selection to the Major_Genre field
    init={'TX_RGN_DESCR_NL': 'Vlaams Gewest'}, # use first genre entry as initial value
    bind=alt.binding_select(options=regions) # bind to a menu of unique provence values
)

base = alt.Chart(df_plot).add_selection(
    select_province
).transform_filter(
    select_province
).transform_calculate(
    gender=alt.expr.if_(alt.datum.SEX == 'M', 'Male', 'Female')
).properties(
    width=250
)

left = base.transform_filter(
    alt.datum.gender == 'Female'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    x=alt.X('percentage:Q', axis=alt.Axis(format='.2%'),
            title='Percentage',
            sort=alt.SortOrder('descending'),
            # scale=alt.Scale(domain=(0.0, 0.02), clamp=True)
            ),
    color=alt.Color('gender:N', scale=color_scale, legend=None),
    tooltip=[alt.Tooltip('percentage', format='.2%')]
).mark_bar().properties(title='Female')

middle = base.encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    text=alt.Text('AGEGROUP:O'),
).mark_text().properties(width=20)

right = base.transform_filter(
    alt.datum.gender == 'Male'
).encode(
    y=alt.Y('AGEGROUP:O', axis=None),
    # x=alt.X('percentage:Q', title='Percentage', axis=alt.Axis(format='.2%'), scale=alt.Scale(domain=(0.0, 0.02), clamp=True)),
    x=alt.X('percentage:Q', title='Percentage', axis=alt.Axis(format='.2%')),
    color=alt.Color('gender:N', scale=color_scale, legend=None),
    tooltip=[alt.Tooltip('percentage', format='.2%')]
).mark_bar().properties(title='Male')

alt.concat(left, middle, right, spacing=5).properties(title='Percentage of covid-19 deaths per province, gender and age group relative to number of inhabitants in Belgium')