import requests, zipfile, io
import tempfile as tf
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
Population by Mother Tongue and Geography - Open Canada
Data Provider
Statistics Canada’s Open Government is a free and open-access platform containing over 80,000 datasets across diverse subjects. The purpose of sharing all data documents with the public is to remain transparent and accessible.
Datasets can be discovered here through several search methods, such as Browse by subject; the Open Government Portal for direct keyword search; Open Maps, which contains geospatial data; the Open Data Inventory from Government of Canada organizations; the Apps Gallery for mobile and web-based application data; Open Data 101, which explains how to use the datasets; and so on.
Population by mother tongue and geography from 1951 to 2021
This census dataset tabulates the population and percentage by three different types of mother tongue (English, French, and non-official languages) from 1951 to 2021.
The dataset and its metadata file, which contains detailed variable descriptions, have been stored together as a zip file here. Alternatively, customized tables are available here.
Libraries
library(tidyverse)
library(gridExtra) # for grid.arrange() function
Data Acquisition
The following code downloads the original dataset, tidies it, and reshapes it so that the two measurements (population count and percentage) end up in separate columns.
# Download the zip file of population by mother tongue
url = "https://www150.statcan.gc.ca/n1/tbl/csv/15100031-eng.zip"
r = requests.get(url)
# Fail here on an HTTP error instead of later with a confusing bad-zip error
r.raise_for_status()
# The archive is opened straight from the response bytes held in memory,
# so no temporary file on disk is needed
temp = zipfile.ZipFile(io.BytesIO(r.content))
# List the members of the archive
file_list = temp.namelist()
print(file_list)
['15100031.csv', '15100031_MetaData.csv']
# Reading the csv file by name (opened directly from inside the zip archive)
tongue = pd.read_csv(temp.open('15100031.csv'))
# Organize the dataset and split it by measurement methods
tongue = tongue.rename(columns = {"GEO" : "location",
                                  "REF_DATE" : "year"})
# filter to set equal allocation of multi-lingual to their individual categories
tongue = tongue[(tongue["Multiple responses"] == "Distributed")]
# Keep only the columns needed for the pivot below
tongue = tongue[["year",
                 "location",
                 "Mother tongue",
                 "Statistics",
                 "VALUE"]]
# One row per (year, location, mother tongue); each distinct value of
# "Statistics" becomes its own column holding the matching VALUE
tongue = pd.pivot_table(tongue,
                        values = "VALUE",
                        index = ["year",
                                 "location",
                                 "Mother tongue"],
                        columns = "Statistics").reset_index()
# Download the zip file of population by mother tongue
temp <- tempfile()
download.file("https://www150.statcan.gc.ca/n1/tbl/csv/15100031-eng.zip", temp)
# Wrapping the assignment in parentheses also prints the archive's file names
(file_list <- as.character(unzip(temp, list = TRUE)$Name))
[1] "15100031.csv" "15100031_MetaData.csv"
# Read the data file directly from inside the downloaded zip archive
tongue <- read_csv(unz(temp, "15100031.csv"))
unlink(temp) # Delete temp file
# Organize the dataset and split it by measurement methods
tongue <- tongue |>
  # make.names() turns column names into syntactic R names,
  # e.g. "Mother tongue" becomes Mother.tongue
  rename_with(make.names) |>
  rename(year = REF_DATE, location = GEO) |>
  # filter to set equal allocation of multi-lingual to their individual categories
  filter(Multiple.responses == "Distributed") |>
  select(year, location, Mother.tongue, Statistics, VALUE) |>
  # one column per value of Statistics, filled from VALUE
  pivot_wider(names_from = Statistics, values_from = VALUE)
Percentage changes of different mother-tongue speakers in Canada
The following code plots count, percentage, and percent change in mother tongue speakers in Canada.
# Keep only the rows for Canada as a whole and drop the all-languages total
canada = tongue[(tongue["location"] == "Canada") &
                (tongue["Mother tongue"] != "Total, mother tongue")]

sb.set_style("darkgrid")
# Speaker counts over time, one line per mother tongue
can_plot_1 = sb.lineplot(data = canada,
                         x = "year",
                         y = "Number",
                         hue = "Mother tongue")
can_plot_1.set_title("Number of speakers of mother tongues in Canada")
plt.show()
# Clear the current figure before drawing the next plot
plt.clf()
# Share of speakers over time, one line per mother tongue
can_plot_2 = sb.lineplot(data = canada,
                         x = "year",
                         y = "Percentage",
                         hue = "Mother tongue")
can_plot_2.set_title("Percentage of speakers of mother tongues in Canada")
plt.show()
# Suppress warning about chained indexing
pd.options.mode.chained_assignment = None
# Percent change of "Percentage" from one row to the next within each mother
# tongue; the first row of each group has no predecessor and is NaN
# NOTE(review): assumes rows are already ordered by year within each group
canada["Percentage Change"] = canada.groupby("Mother tongue")["Percentage"].pct_change()*100

# Clear the current figure before drawing the next plot
plt.clf()
can_plot_3 = sb.lineplot(data = canada,
                         x = "year",
                         y = "Percentage Change",
                         hue = "Mother tongue")
can_plot_3.set_title("Percentage change over 4 years "
                     "for different mother tongues in Canada")
plt.show()
# Speaker counts over time for Canada as a whole, one line per mother tongue
tongue |>
  filter(location == "Canada" & Mother.tongue != "Total, mother tongue") |>
  ggplot(aes(x = year, y = Number, group = Mother.tongue)) +
  labs(title = "Number of speakers of mother tongues in Canada") +
  geom_line(aes(color = Mother.tongue))
# Share of speakers over time for Canada as a whole, one line per mother tongue
tongue |>
  filter(location == "Canada" & Mother.tongue != "Total, mother tongue") |>
  ggplot(aes(x = year, y = Percentage, group = Mother.tongue)) +
  labs(title = "Percentage of speakers of mother tongues in Canada") +
  geom_line(aes(color = Mother.tongue))
# Percent change in share from one row to the next, per mother tongue
tongue |>
  filter(location == "Canada" & Mother.tongue != "Total, mother tongue") |>
  group_by(Mother.tongue) |>
  # lag() takes the previous row within each group
  # NOTE(review): assumes rows are already ordered by year within each group
  mutate(pct_change = (Percentage / lag(Percentage) - 1) * 100) |>
  # drops rows containing NA, including each group's first row (no predecessor)
  na.omit() |>
  ggplot(aes(x = year, y = pct_change, group = Mother.tongue)) +
  labs(
    y = "Percentage Change",
    title = "Percentage change over 4 years for different mother tongues in Canada"
  ) +
  geom_line(aes(color = Mother.tongue))
Summary
The upper-panel graph shows the population count of different language speakers in Canada. We can see that the number of speakers has been increasing over time for every mother-tongue group.
The second-panel graph shows the percentage of different mother tongue speakers in Canada. This plot shows the proportion of French speakers has decreased steadily, while the proportion of non-official language speakers has grown.
The third plot shows the change in percentage between censuses. This plot is less insightful, other than to show how the code works. It shows that the percentage change for French has been stable for decades at a slightly negative value. English is fairly steady, though declining slightly.
Percentage of mother-tongue speakers in different provinces
The following code focuses on the percentages within provinces.
# Plot the French and non-official language speakers percents among all provinces in Canada
# Clear the current figure before drawing the next plot
plt.clf()
# Every location except the national total, French mother tongue only
not_eng_plot_1 = sb.lineplot(data = (tongue[(tongue["location"] != "Canada") &
                                            (tongue["Mother tongue"] == "French")]),
                             x = "year",
                             y = "Percentage",
                             hue = "location")
not_eng_plot_1.set_title("Percentage of French "
                         "mother tongues in different provinces")
# Anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.show()
# Clear the current figure before drawing the next plot
plt.clf()
# Every location except the national total, non-official mother tongues only
# NOTE(review): the R version of this plot filters on "Non-official languages"
# (plural) - confirm which label the dataset actually uses
not_eng_plot_2 = sb.lineplot(data = (tongue[(tongue["location"] != "Canada") &
                                            (tongue["Mother tongue"] == "Non-official language")]),
                             x = "year",
                             y = "Percentage",
                             hue = "location")
not_eng_plot_2.set_title("Percentage of non-official "
                         "language mother tongues in different provinces")
# Anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1),
           loc='upper left',
           borderaxespad=0)
plt.show()
# Plot the French and non-official language speakers percentages among all provinces in Canada
# Every location except the national total, French mother tongue only
tongue |>
  filter(location != "Canada" & Mother.tongue == "French") |>
  ggplot(aes(x = year, y = Percentage, group = location)) +
  labs(
    y = "Percentage",
    title = "Percentage of French mother tongues in different provinces"
  ) +
  geom_line(aes(color = location))
# Every location except the national total, non-official mother tongues only
# NOTE(review): the Python version of this plot filters on
# "Non-official language" (singular) - confirm which label the dataset uses
tongue |>
  filter(location != "Canada" & Mother.tongue == "Non-official languages") |>
  ggplot(aes(x = year, y = Percentage, group = location)) +
  labs(
    y = "Percentage",
    title = "Percentage of non-official language mother tongues in different provinces"
  ) +
  geom_line(aes(color = location))