import requests, zipfile, io
import tempfile as tf
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
Production and Sale of Greenhouse Flowers and Plants - Open Canada
Data Provider
Statistics Canada’s Open Government is a free and open-access platform containing over 80,000 datasets across diverse subjects. The purpose of sharing all data documents with the public is to remain transparent and accessible.
Datasets can be discovered here through several search methods, such as Browse by Subject; the Open Government Portal, for direct keyword searches; Open Maps, which contains geospatial data; the Open Data Inventory from Government of Canada organizations; the Apps Gallery, which presents mobile and web-based applications built on the data; and Open Data 101, which explains how to use the datasets.
Production and Sales of Greenhouse flowers and plants
The annual production and sales of different types of flowers and plants in Canada from 2007 to 2019 are compiled here, together with a metadata file. The metadata contains detailed variable descriptions. The dataset and supporting documentation are available from Statistics Canada.
Variables include years from 2007 to 2019, province of production, flower and plant types, and production count and sales.
Libraries
library(tidyverse)
library(GGally)
Organizing Dataset
The following code is used to obtain and organize the dataset and separate it into two by output type (sales and production).
# Download the zip file of plant sales and production
url = "https://www150.statcan.gc.ca/n1/tbl/csv/32100246-eng.zip"
r = requests.get(url)
r.raise_for_status()  # stop here on an HTTP error instead of failing inside ZipFile

# The archive is opened straight from the response bytes, so no temporary
# file on disk is needed.
temp = zipfile.ZipFile(io.BytesIO(r.content))
file_list = temp.namelist()
print(file_list)
# ['32100246.csv', '32100246_MetaData.csv']

# The first archive member is the data file; the second is its metadata.
plants = pd.read_csv(temp.open(file_list[0]))

# Rename a couple of columns
plants = plants.rename(columns={"GEO": "location", "REF_DATE": "year"})
# Download the zip file of plant sales and production
temp <- tempfile()
download.file(
  "https://www150.statcan.gc.ca/n1/tbl/csv/32100246-eng.zip",
  temp,
  mode = "wb" # binary transfer so the archive is not corrupted on Windows
)
(file_list <- as.character(unzip(temp, list = TRUE)$Name))
# [1] "32100246.csv" "32100246_MetaData.csv"

# The first archive entry is the data file; the second is its metadata
plants <- read_csv(unz(temp, file_list[1]))
unlink(temp) # Delete temp file

# Rename a couple of columns
plants <- plants |>
  rename(year = REF_DATE, location = GEO) |> # personal preference for these names
  rename_with(make.names) # R friendly naming that replaces spaces with '.'
Greenhouse plants production and sales in Canada
The following code is used to plot greenhouse plant production and sales in Canada
# Subset a dataset of plants production in Canada and plot one line per
# flower/plant type
sb.set_style("darkgrid")
sb.lineplot(data=plants[(plants["location"] == "Canada") &
                        (plants["Output"] == "Production (number)")],
            x="year",
            y="VALUE",
            hue="Flowers and plants")
plt.ylabel("Production (number)")
plt.title("Greenhouse plants production in Canada")
# Anchor the legend outside the axes so it does not cover the lines
plt.legend(title="Flowers and Plants",
           bbox_to_anchor=(1.05, 1),
           loc='upper left',
           borderaxespad=0)
plt.show()
# Subset a dataset of plants sales in Canada and plot one line per
# flower/plant type
plt.clf()  # clear the current figure before drawing this plot
sb.lineplot(data=plants[(plants["location"] == "Canada") &
                        (plants["Output"] == "Sales")],
            x="year",
            y="VALUE",
            hue="Flowers and plants")
# This plot shows sales, so the axis is labelled accordingly
plt.ylabel("Sales ($ dollars)")
plt.title("Greenhouse plants sales in Canada")
# Anchor the legend outside the axes so it does not cover the lines
plt.legend(title="Flowers and Plants",
           bbox_to_anchor=(1.05, 1),
           loc='upper left',
           borderaxespad=0)
plt.show()
# Subset a dataset of plants production in Canada and plot one line per
# flower/plant type
plants |>
  subset(location == "Canada" & Output == "Production (number)") |>
  ggplot(aes(x = year, y = VALUE, group = Flowers.and.plants)) +
  labs(
    y = "Production (number)",
    title = "Greenhouse plants production in Canada"
  ) +
  geom_line(aes(color = Flowers.and.plants))
# Subset a dataset of plants sales in Canada and plot one line per
# flower/plant type
plants |>
  subset(location == "Canada" & Output == "Sales") |>
  ggplot(aes(x = year, y = VALUE, group = Flowers.and.plants)) +
  labs(
    y = "Sales ($ dollars)",
    title = "Greenhouse plants sales in Canada"
  ) +
  geom_line(aes(color = Flowers.and.plants))
The following code is used to explore plant production in the different provinces.
Bedding plants (vegetables & ornamental) production
# Select the vegetable bedding plant production rows for the individual
# provinces (everything except the national "Canada" total) and plot one
# line per province
plt.clf()  # clear the current figure before drawing this plot
sb.lineplot(data=plants[(plants["location"] != "Canada") &
                        (plants["Output"] == "Production (number)") &
                        (plants["Flowers and plants"] ==
                         "Total vegetable bedding plants [115143421]")],
            x="year",
            y="VALUE",
            hue="location")
plt.ylabel("Number Produced")
plt.title("Total vegetable bedding plants production")
# Anchor the legend outside the axes so it does not cover the lines
plt.legend(title="Province",
           bbox_to_anchor=(1.05, 1),
           loc='upper left',
           borderaxespad=0)
plt.show()
# Select the ornamental bedding plant production rows for the individual
# provinces (everything except the national "Canada" total) and plot one
# line per province
plt.clf()  # clear the current figure before drawing this plot
sb.lineplot(data=plants[(plants["location"] != "Canada") &
                        (plants["Output"] == "Production (number)") &
                        (plants["Flowers and plants"] ==
                         "Total ornamental bedding plants [115143411]")],
            x="year",
            y="VALUE",
            hue="location")
plt.ylabel("Number Produced")
plt.title("Total ornamental bedding plants production")
# Anchor the legend outside the axes so it does not cover the lines
plt.legend(title="Province",
           bbox_to_anchor=(1.05, 1),
           loc='upper left',
           borderaxespad=0)
plt.show()
# Select the vegetable bedding plant production rows for the individual
# provinces (everything except the national "Canada" total) and plot one
# line per province
plants |>
  subset(
    location != "Canada" &
      Output == "Production (number)" &
      Flowers.and.plants == "Total vegetable bedding plants [115143421]"
  ) |>
  ggplot(aes(x = year, y = VALUE, group = location)) +
  geom_line(aes(color = location)) +
  labs(y = "Number produced") +
  ggtitle("Total vegetable bedding plants production")
# Select the ornamental bedding plant production rows for the individual
# provinces (everything except the national "Canada" total) and plot one
# line per province
plants |>
  subset(
    location != "Canada" &
      Output == "Production (number)" &
      Flowers.and.plants == "Total ornamental bedding plants [115143411]"
  ) |>
  ggplot(aes(x = year, y = VALUE, group = location)) +
  geom_line(aes(color = location)) +
  labs(y = "Number produced") +
  ggtitle("Total ornamental bedding plants production")
Examining all production counts within Canada
# Keep the national production counts, one row per (year, plant type)
plants_2 = plants[(plants["location"] == "Canada") &
                  (plants["Output"] == "Production (number)")][["year",
                                                                "Flowers and plants",
                                                                "VALUE"]]
# Reshape to wide form: one row per year, one column per plant type
plants_2 = pd.pivot_table(plants_2,
                          values="VALUE",
                          columns="Flowers and plants",
                          index="year").reset_index()
# Pairwise scatter plots of every column against every other
sb.pairplot(data=plants_2)
# Keep the national production counts, reshape to one column per plant type,
# and draw pairwise plots of every column against every other
plants |>
  subset(location == "Canada" & Output == "Production (number)") |>
  select(year, Flowers.and.plants, VALUE) |>
  pivot_wider(names_from = Flowers.and.plants, values_from = VALUE) |>
  ggpairs()