I am here as an individual, long-active Open Source developer and data scientist. Views and opinions expressed are solely my own.
docker run --rm -it -p 8888:8888 \
lgautier/pragmatic-polyglot-data-analysis
cursor = dbcon.cursor()
sql = """
SELECT state, count(city)
FROM location
WHERE state LIKE 'M%'
GROUP BY state
"""
cursor.execute(sql)
C | C++ | bash | Javascript | Julia | Python | R | Scala | SQL |
Julia | ⇒ | Python | : | PyCall |
Julia | ⇒ | R | : | Rif or RCall |
[somewhere] | ⇒ | R | : | Rserve |
R | ⇒ | Python | : | rPython |
Polyglot data analysis a project/ensemble with:
Detailed account of raw materials and workdays for a basketry workshop. Clay, ca. 2040 BC (Ur III). Source: wikipedia Marie-Lan Nguyen (2006) |
! git clone https://github.com/lgautier/project-tycho-utilities.git
fatal: destination path 'project-tycho-utilities' already exists and is not an empty directory.
! cd project-tycho-utilities/ && DBNAME=../tycho.db make all
make[1]: Entering directory '/opt/local/render/project-tycho-utilities' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/opt/local/render/project-tycho-utilities'
|
|
|
import sqlite3
dbfilename = "tycho.db"
dbcon = sqlite3.connect(dbfilename)
cursor = dbcon.cursor()
sql = """
SELECT state, city
FROM location
WHERE state LIKE 'M%'
"""
cursor.execute(sql)
<sqlite3.Cursor at 0x7f65dc575340>
from collections import Counter
ct = Counter()
for row_n, (state, city) in enumerate(cursor, 1):
ct[state] += 1
print(ct.most_common(n=5))
[('MA', 67), ('MI', 26), ('MN', 10), ('ME', 9), ('MO', 8)]
sql = """
SELECT state, count(city) city_count
FROM (select D.location_id
FROM (select location_id, COUNT(DISTINCT(disease_id)) AS disease_count
FROM casecount
GROUP BY location_id) AS D
WHERE D.disease_count > 5) AS HD
INNER JOIN location
ON HD.location_id = location.id
GROUP BY state
ORDER BY city_count DESC
LIMIT 5
"""
cursor.execute(sql)
<sqlite3.Cursor at 0x7f65dc575490>
Complex queries require increasingly complex SQL
⇨ So we need to know SQL, don't we ?
|
result = (Location
.select()
.where(Location.state.startswith('M')))
(example with SQLObject)
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
Base = automap_base()
# engine, suppose it has two tables 'user' and 'address' set up
engine = create_engine("sqlite:///tycho.db")
Base.prepare(engine, reflect=True)
location = Base.classes.location
session = Session(engine)
from sqlalchemy import func # SQL functions
query = (session
.query(location.state,
func.count(location.city))
.filter(location.state.like('M%'))
.group_by(location.state)
.order_by(func.count(location.city).desc())
.limit(5))
res = query.all()
from sqlalchemy.dialects import sqlite
print(str(query.statement.compile(dialect=session.bind.dialect))
.replace('GROUP BY', '\nGROUP BY'))
SELECT location.state, count(location.city) AS count_1 FROM location WHERE location.state LIKE ? GROUP BY location.state ORDER BY count(location.city) DESC LIMIT ? OFFSET ?
mutate | modify/add column |
filter | filter rows |
select | select columns |
group_by | group rows |
summarize | summarize groups of rows |
datasrc <- src_sqlite(dbfilename)
location_tbl <- tbl(datasrc, "location")
res <- filter(location_tbl,
state %like% 'M%') %>%
count(city) %>%
arrange('desc(n)')
from rpy2.robjects.lib import dplyr
datasrc = dplyr.src_sqlite(dbfilename)
location_tbl = datasrc.get_table("location")
res = (location_tbl
.filter('state %like% "M%"')
.group_by('state')
.count('state')
.arrange('desc(n)'))
print(res)
Source: query [?? x 2] Database: sqlite 3.8.6 [tycho.db] state n <chr> <int> 1 MA 67 2 MI 26 3 MN 10 4 ME 9 5 MO 8 6 MT 7 7 MD 5 8 MS 3 9 MP 1
casecount_tbl = datasrc.get_table("casecount")
##
disease_per_city = (casecount_tbl
.group_by('location_id')
.summarize(n='count(distinct(disease_id))'))
##
high_disease = (disease_per_city
.filter('n > 5'))
##
inner_join = dplyr.dplyr.inner_join
join_location = inner_join((location_tbl
.mutate(location_id="id")),
high_disease,
by="location_id")
res = (dplyr.DataFrame(join_location)
.group_by('state')
.summarize(n='count(city)')
.arrange('desc(n)')
.collect())
from rpy2.robjects import r, globalenv
import rpy2.robjects.lib.ggplot2 as gg
p = (gg.ggplot(res.head(20)) +
gg.geom_bar(gg.aes_string(x='factor(state, levels=as.character(state))',
y='n'),
stat='identity') +
gg.scale_x_discrete("State") +
gg.scale_y_sqrt("# cities w/ >5 diseases"))
from rpy2.ipython.ggplot import image_png
from IPython.display import display
display(image_png(p))
from rpy2.robjects import baseenv
state_abb = (dplyr.DataFrame({'state': baseenv.get('state.abb'),
'region': baseenv.get('state.name')})
.mutate(region = 'tolower(region)'))
from rpy2.robjects.packages import importr
maps = importr('maps')
states = gg.map_data('state')
merge = baseenv.get('merge')
states_map = merge(states, state_abb, sort=False, by="region")
dataf_plot = merge(states_map, res, sort=False, by="state")
dataf_plot = dplyr.DataFrame(dataf_plot).arrange('order')
p = (gg.ggplot(dataf_plot) +
gg.geom_polygon(gg.aes_string(x='long', y='lat', group='group', fill='n')) +
gg.scale_fill_continuous(trans="sqrt") +
gg.coord_map("albers", at0 = 45.5, lat1 = 29.5))
display(image_png(p))
sql_disease = """
SELECT date_from, count, city
FROM casecount
INNER JOIN disease
ON casecount.disease_id=disease.id
INNER JOIN location
ON casecount.location_id=location.id
WHERE disease.name='%s'
AND state='%s'
AND city IS NOT NULL
"""
sql = sql_disease % ('PNEUMONIA', 'MA')
dataf = dplyr.DataFrame(dplyr.tbl(datasrc, dplyr.dplyr.sql(sql))).collect()
dataf = dataf.mutate(date='as.POSIXct(strptime(date_from, format="%Y-%m-%d"))')
dataf = dataf.mutate(month = 'format(date, "%m")',
year = 'format(date, "%Y")')
# sum per month
dataf_plot = (dataf
.group_by('city', 'month','year')
.summarize(count='sum(count)'))
#
yearmonth_to_date = """
as.POSIXct(
strptime(
paste(year, month, "15", sep="-"),
format="%Y-%m-%d")
)
"""
dataf_plot = dataf_plot.mutate(date=yearmonth_to_date)
p = (gg.ggplot(dataf_plot
.filter('year > 1925')) +
gg.geom_line(gg.aes_string(x='month', y='count',
group='year', color='city')) +
gg.facet_grid('city~.', scales="free_y") +
gg.scale_y_sqrt())
display(image_png(p))
def foo(disease, state):
sql = sql_disease % (disease, state)
dataf = dplyr.DataFrame(dplyr.tbl(datasrc, dplyr.dplyr.sql(sql))).collect()
dataf = dataf.mutate(date='as.POSIXct(strptime(date_from, format="%Y-%m-%d"))')
dataf = dataf.mutate(month = 'format(date, "%m")',
year = 'format(date, "%Y")')
dataf_plot = (dataf
.group_by('city', 'month','year')
.summarize(count='sum(count)'))
dataf_plot = dataf_plot.mutate(date=yearmonth_to_date)
p = (gg.ggplot(dataf_plot
.filter('year > 1925')) +
gg.geom_line(gg.aes_string(x='month', y='count+1',
group='year', color='city')) +
gg.facet_grid('city~.', scales="free_y") +
gg.scale_y_sqrt() +
gg.ggtitle(disease))
display(image_png(p, height=600))
from ipywidgets import interact
interact(foo,
disease=('PNEUMONIA','INFLUENZA','MEASLES'),
state=('MA','NH','CA'))
<function __main__.foo>
from bokeh.plotting import (figure, show,
ColumnDataSource,
output_notebook)
##from bokeh.resources import INLINE
output_notebook()
res = res.head(20)
plot = figure(x_range=list(res.rx2('state')))
source = ColumnDataSource(dict((x, tuple(res.rx2(x))) for x in res.colnames))
plot.vbar(x='state',
bottom=0, top='n',
width=0.5,
color='STEELBLUE',
source=source)
import findspark
findspark.init()
import pyspark
conf = pyspark.conf.SparkConf()
(conf.setMaster('local[2]')
.setAppName('ipython-notebook')
.set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)
from pyspark.sql import SQLContext, Row
sqlcontext = SQLContext(sc)
cursor.execute("SELECT * FROM location")
location = \
sqlcontext.createDataFrame(cursor,
tuple(x[0] for x in cursor.description))
location.registerTempTable("location")
sql = """
SELECT *
FROM (SELECT * FROM disease WHERE name='PNEUMONIA') AS disease
INNER JOIN casecount
ON disease.id=casecount.disease_id"""
cursor.execute(sql)
casecount = \
sqlcontext.createDataFrame(cursor,
tuple(x[0] for x in cursor.description))
casecount.registerTempTable("casecount")
The data is now distributed on a Spark cluster.
sql = """
SELECT state, count(city) AS ct
FROM location
GROUP BY state
ORDER BY ct DESC
"""
counts = sqlcontext.sql(sql)
res = counts.collect()
res[:3]
[Row(state='PA', ct=75), Row(state='MA', ct=66), Row(state='OH', ct=46)]
names = (location
.filter(location.city.isNotNull())
.rdd
.flatMap(lambda rec: [x[-5:] for x in rec.city.split()])
.map(lambda word: (word.lower(), 1))
.reduceByKey(lambda a, b: a+b))
names.takeOrdered(10, key = lambda x: -x[1])
[('ville', 26), ('ngton', 18), ('city', 16), ('field', 15), ('sburg', 13), ('new', 13), ('ester', 11), ('stown', 7), ('west', 7), ('st', 6)]
sql = """
SELECT city
FROM (SELECT * FROM casecount WHERE epiweek LIKE '1912%') AS sub
INNER JOIN location
ON location.id=sub.location_id
GROUP BY city
"""
y_1912 = sqlcontext.sql(sql)
names = (y_1912
.filter(y_1912.city.isNotNull())
.rdd
.flatMap(lambda rec: [x[-5:] for x in rec.city.split()])
.map(lambda word: (word.lower(), 1))
.reduceByKey(lambda a,b: a+b))
names.takeOrdered(5, key = lambda x: -x[1])
[('ville', 6), ('new', 4), ('ngton', 4), ('stown', 4), ('san', 3)]
## --- SQL ---
sql = """
SELECT state, city, date_from, count AS ct
FROM (SELECT * FROM casecount WHERE epiweek LIKE '1912%') AS sub
INNER JOIN location
ON location.id=sub.location_id
"""
y_1912 = sqlcontext.sql(sql)
## --- Spark ---
cases = (y_1912
.rdd
.map(lambda rec: ((rec.state, int(rec.date_from.split('-')[1])),
rec.ct))
.reduceByKey(lambda a,b: a+b)).collect()
## --- R (from Python) ---
from rpy2.robjects import StrVector, IntVector, FactorVector, Formula
months = StrVector([str(x) for x in range(1,13)])
res = dplyr.DataFrame({'state': StrVector([x[0][0] for x in cases]),
'month': FactorVector([x[0][1] for x in cases],
levels = months),
'count': IntVector([x[1] for x in cases])})
dataf_plot = merge(states_map, res, all_x=True, sort=False, by="state")
dataf_plot = dplyr.DataFrame(dataf_plot).arrange('order')
jetcols = StrVector(("#00007F", "#007FFF", "#7FFF7F", "#FF7F00", "#7F0000"))
p = (gg.ggplot(dataf_plot) +
gg.geom_polygon(gg.aes_string(x='long', y='lat',
group='group', fill='count')) +
gg.coord_map("albers", at0 = 45.5, lat1 = 29.5) +
gg.scale_fill_gradientn(colors=jetcols, trans='sqrt') +
gg.facet_wrap(facets=Formula("~month")) +
gg.ggtitle("Cases of Pneumonia in 1912"))
display(image_png(p))
dataf_now = dataf_plot.filter('month %in% c(9,10)')
p = (gg.ggplot(dataf_now) +
gg.geom_polygon(gg.aes_string(x='long', y='lat',
group='group', fill='count')) +
gg.coord_map("albers", at0 = 45.5, lat1 = 29.5) +
gg.scale_fill_gradientn(colors=jetcols, trans='sqrt') +
gg.facet_wrap(facets=Formula("~month")) +
gg.ggtitle("Cases of Pneumonia in 1912"))
display(image_png(p))
dataf_now = (dataf_plot
.filter('month %in% c(9,10)',
'state %in% c("CT", "NY", "MA", "NJ", "NH", "VM")'))
p = (gg.ggplot(dataf_now) +
gg.geom_polygon(gg.aes_string(x='long', y='lat',
group='group', fill='count')) +
gg.coord_map("albers", at0 = 45.5, lat1 = 29.5) +
gg.scale_fill_gradientn(colors=jetcols, trans='sqrt') +
gg.facet_wrap(facets=Formula("~month")) +
gg.ggtitle("Cases of Pneumonia in 1912"))
display(image_png(p))