library(tidyverse)
library(viridis)
library(hrbrthemes)
library(GGally)
Private Elementary and Secondary Schools Revenues - Open Canada
Data Provider
The provincial government of Ontaio provides open access to thousands of data sets via their Open Data Ontario portal. The purpose of sharing all data documents with the public is to remain transparent and accessible. More details about the data license, training materials, and other information can be found here.
Private Elementary and Secondary Schools Revenues, by Direct Source of Funds
This dataset contains 138 observations across provinces and funding sources from 1947 to 2002.
The dataset and its metadata file which contains detailed variable descriptions have been stored together as a zip file. The data resource and its description can be found here, you also can quickly discover the customized table here.
Libraries
import requests, zipfile, io
import tempfile as tf
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
Organizing Dataset
The following code is used to download and organize the original dataset, and only select the important variables to form a new dataset.
# Download the zip file of school revenues dataset
<- tempfile()
temp download.file("https://www150.statcan.gc.ca/n1/tbl/csv/37100084-eng.zip",temp)
<- as.character(unzip(temp, list = TRUE)$Name)) (file_list
[1] "37100084.csv" "37100084_MetaData.csv"
<- read_csv(unz(temp, "37100084.csv"))
revenue unlink(temp) # Delete temp file
# Organize the dataset
<- revenue |>
revenue separate(REF_DATE, c("year", "year.end"), "/") |>
mutate(year = as.numeric(year))|>
drop_na(VALUE) |>
rename_all(make.names)
# take a look at the data:
|> glimpse() revenue
Rows: 4,163
Columns: 16
$ year <dbl> 1947, 1947, 1947, 1947, 1947, 1947, 1947, 1947,…
$ year.end <chr> "1948", "1948", "1948", "1948", "1948", "1948",…
$ GEO <chr> "Canada", "Canada", "Canada", "Canada", "Canada…
$ DGUID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Direct.source.of.funds <chr> "Total revenues", "Government sources", "Federa…
$ UOM <chr> "Dollars", "Dollars", "Dollars", "Dollars", "Do…
$ UOM_ID <dbl> 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81,…
$ SCALAR_FACTOR <chr> "thousands", "thousands", "thousands", "thousan…
$ SCALAR_ID <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ VECTOR <chr> "v1444638", "v1444642", "v1444639", "v1444640",…
$ COORDINATE <dbl> 1.1, 1.5, 1.2, 1.3, 1.4, 1.8, 1.9, 1.1, 6.1, 6.…
$ VALUE <dbl> 13951, 0, 0, 0, 0, 4114, 0, 9837, 743, 0, 0, 0,…
$ STATUS <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ SYMBOL <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ TERMINATED <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ DECIMALS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Download the zip file of plant sales and production
= tf.TemporaryFile()
temp = "https://www150.statcan.gc.ca/n1/tbl/csv/37100084-eng.zip"
url = requests.get(url)
r = zipfile.ZipFile(io.BytesIO(r.content))
temp = temp.namelist()
file_list print(file_list)
['37100084.csv', '37100084_MetaData.csv']
= pd.read_csv(temp.open("37100084.csv"))
revenue
"year", "year end"]] = revenue["REF_DATE"].str.split("/", n=1, expand = True)
revenue[[= revenue.dropna(subset = ["VALUE"])
revenue "year"] = revenue["year"].astype(int)
revenue[
revenue.head()
REF_DATE GEO DGUID ... DECIMALS year year end
0 1947/1948 Canada NaN ... 0 1947 1948
1 1947/1948 Canada NaN ... 0 1947 1948
2 1947/1948 Canada NaN ... 0 1947 1948
3 1947/1948 Canada NaN ... 0 1947 1948
4 1947/1948 Canada NaN ... 0 1947 1948
[5 rows x 17 columns]
School Revenue by Different Funded Source in Canada
The following code is used to draw the stacked area chart so that we can directly see the time-series trend and proportion of different funding sources over the years.
# Subset a reveune dataset which only focus on different funding source in Canada and plot the graph
|> subset( GEO == "Canada" & Direct.source.of.funds != "Total revenues")|>
revenue ggplot( aes(x=year, y=VALUE, group=Direct.source.of.funds, colour =Direct.source.of.funds)) +
geom_line(alpha=0.6 , size=.5) +
scale_fill_viridis(discrete = T) +
theme_ipsum() +
labs(title = "School Revenue Funded \n from Different Sources")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Subset a reveune dataset which only focus on different funding source in Canada and plot the graph
"darkgrid")
sb.set_style(
= revenue[(revenue["GEO"] == "Canada") &
sb.lineplot(data "Direct source of funds"] != "Total revenues")],
(revenue[= "year",
x = "VALUE",
y = "Direct source of funds")
hue "School Revenue Funded from Different Sources")
plt.title(= "Direct source of funds",
plt.legend(title =(1.05, 1),
bbox_to_anchor='upper left',
loc=0)
borderaxespad; plt.show()
Total School Revenue in Different Provinces
The following code is used to draw the stacked area chart so that we can directly see the time-series trend and proportion in different provinces over the years.
# Subset a reveune dataset which only focuses on different funding source in Canada and plot the graph
|> subset( GEO != "Canada" & Direct.source.of.funds == "Total revenues")|>
revenue ggplot( aes(x=year, y=VALUE, group=GEO, colour =GEO)) +
geom_line(alpha=0.6 , size=.5) +
scale_fill_viridis(discrete = T) +
theme_ipsum() +
ggtitle("Total School Revenue Funded \n in Different Provinces")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plt.clf()= revenue[(revenue["GEO"] != "Canada") &
sb.lineplot(data "Direct source of funds"] == "Total revenues")],
(revenue[= "year",
x = "VALUE",
y = "GEO")
hue "Total School Revenue Funded in Different Provinces")
plt.title(= "Province",
plt.legend(title =(1.05, 1),
bbox_to_anchor='upper left',
loc=0)
borderaxespad; plt.show()
Tidy the variables
There are quite a few variables that aren’t really of interest like SCALAR_ID and COORDINATE, while other variables are hidden within the single column Direct.source.of.funds. Here we wish to rearrange the data from a tall format into a wider format. The function pivot_wider takes a column and spreads its factors into new columns. This makes the observational unit combinations of year and GEO.
# select the columns of interest
= revenue |>
revenue_wide select(year, GEO, Direct.source.of.funds, VALUE) |>
# pivot wider by taking factors from Direct.source.of.funds
# fill in the variable with values from VALUE
pivot_wider(names_from = Direct.source.of.funds, values_from = VALUE) |>
# rebuild the variable names so that they do not have spaces
rename_all(make.names)
|> glimpse() revenue_wide
Rows: 452
Columns: 12
$ year <dbl> 1947, 1947, 1947, 1947, 1947, 1947, 1947,…
$ GEO <chr> "Canada", "Atlantic Province", "Quebec", …
$ Total.revenues <dbl> 13951, 743, 5900, 4509, 556, 449, 529, 12…
$ Government.sources <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0…
$ Federal.government <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0…
$ Provincial.government <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0…
$ Municipal.government <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0…
$ Fees <dbl> 4114, 144, 1770, 1377, 299, 59, 87, 378, …
$ Transfers.from.school.boards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0…
$ Other.revenue.sources <dbl> 9837, 599, 4130, 3132, 257, 390, 442, 887…
$ Tuition.fees <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ Board.and.lodging.fees <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
= revenue[["year",
revenue_wide "GEO",
"Direct source of funds",
"VALUE"]]
= pd.pivot_table(revenue_wide,
revenue_wide = "VALUE",
values = ["year", "GEO"],
index = "Direct source of funds").reset_index()
columns
revenue_wide.head()
Direct source of funds year ... Tuition fees
0 1947 ... NaN
1 1947 ... NaN
2 1947 ... NaN
3 1947 ... NaN
4 1947 ... NaN
[5 rows x 12 columns]
Wider data makes easier plotting and analysis
To predict the school fees, one would need to consider the other sources of funding. Such a regression model can’t be directly made without first widening the data. Below, the variables are plotted against each other and a basic linear model is performed.
|>
revenue_wide filter(GEO=="Canada") |>
ggpairs()
= revenue_wide); sb.pairplot(data
= revenue_wide |>
CanadaWide filter(GEO=="Canada")
lm(Fees~Transfers.from.school.boards+Municipal.government+Provincial.government+Federal.government,CanadaWide)
Call:
lm(formula = Fees ~ Transfers.from.school.boards + Municipal.government +
Provincial.government + Federal.government, data = CanadaWide)
Coefficients:
(Intercept) Transfers.from.school.boards
17835.680 -4.073
Municipal.government Provincial.government
-0.783 2.094
Federal.government
8.957
from sklearn.linear_model import LinearRegression
= revenue_wide[revenue_wide["GEO"] == "Canada"]
canada_wide = canada_wide.dropna()
canada_wide = LinearRegression()
model
= canada_wide[['Transfers from school boards',
x 'Municipal government',
'Provincial government',
'Federal government']]
= canada_wide['Fees']
y
model.fit(x,y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
print("Intercept: ", model.intercept_)
Intercept: 34936.72845660114
print("Coefficients: \n", list(zip(list(x), model.coef_)))
Coefficients:
[('Transfers from school boards', -0.3976651585145225), ('Municipal government', 96.92934345062541), ('Provincial government', 1.6257502677134603), ('Federal government', -16.753463594594237)]