I am here as an individual, long-active Open Source developer and data scientist. Views and opinions expressed are solely my own.
docker run --rm -it -p 8888:8888 \
rpy2/gbdc-ppda-slides
Visit http://localhost:8888/
*
*: If using docker-machine (Windows, OS X), get the address with:
docker-machine ip [MACHINE]
# Open a cursor on the SQLite connection (dbcon is created in a later cell).
cursor = dbcon.cursor()
# Count low-score reviews (reviewscore < 2) per "helpfulness" value.
sql = """
SELECT reviewhelpful, count(*)
FROM review
WHERE reviewscore < 2
GROUP BY reviewhelpful
"""
cursor.execute(sql)
| C | C++ | bash | Javascript | Julia | Python | R | Scala | SQL |
| Julia | ⇒ | Python | : | PyCall |
| Julia | ⇒ | R | : | Rif or RCall |
| [somewhere] | ⇒ | R | : | Rserve |
| R | ⇒ | Python | : | rPython |
Polyglot data analysis: a project/ensemble with:
|
Detailed account of raw materials and workdays for a basketry workshop. Clay, ca. 2040 BC (Ur III).
Source: wikipedia Marie-Lan Nguyen (2006) |
J. McAuley and J. Leskovec. From amateurs to connoisseurs: modeling the evolution of user expertise through online reviews. WWW, 2013. http://snap.stanford.edu/data/web-FineFoods.html
product/productId: B00...W
review/userId: A1S...MSMR
review/profileName: [some profile name]
review/helpfulness: 0/0
review/score: 1.0
review/time: 1344556800
review/summary: Sad outcome
review/text: Five minutes in, one tentacle was bitten off, ball inside
cracked in half. Not durable enough to be a dog toy.
Disappointed :(. So is the dog :(.
| productid | VARCHAR |
| reviewerid | VARCHAR |
| reviewername | VARCHAR |
| reviewhelpful | VARCHAR |
| reviewscore | INTEGER |
| reviewtime | INTEGER |
| reviewsummary | VARCHAR |
| reviewtext | VARCHAR |
# Connect to the SQLite database holding the fine-foods reviews.
import sqlite3
dbfilename = "/opt/data/finefoods.db"
dbcon = sqlite3.connect(dbfilename)
cursor = dbcon.cursor()
# Count low-score reviews (reviewscore < 2) per "helpfulness" value.
sql = """
SELECT reviewhelpful, count(*)
FROM review
WHERE reviewscore < 2
GROUP BY reviewhelpful
"""
cursor.execute(sql)
<sqlite3.Cursor at 0x7f998441d1f0>
from collections import Counter

# Tally the per-"helpfulness" counts fetched by the query above.
# (The row index produced by enumerate() was never used, so the
# enumerate() wrapper is dropped.)
ct = Counter()
for score, count in cursor:
    ct[score] = count
# Three most common helpfulness values among low-score reviews.
print(ct.most_common(n=3))
[('0/0', 12266), ('1/1', 4809), ('0/1', 3701)]
# Restrict the counts to "prolific" reviewers (more than 5 reviews):
#  - innermost select: number of reviews per reviewer
#  - middle select:    keep reviewers with more than 5 reviews
#  - outer query:      join back to review, count helpfulness values
#                      among their low-score reviews
sql = """
select reviewhelpful, count(*)
from (select T.reviewername
from (select reviewername, count(*) as reviewer_count
from review
group by reviewername) as T
where T.reviewer_count > 5) as U
inner join review
on U.reviewername = review.reviewername
where reviewscore < 2
group by reviewhelpful
"""
cursor.execute(sql)
<sqlite3.Cursor at 0x7f998441d2d0>
Complex queries require increasingly complex SQL
⇨ So we need to know SQL, don't we?
|
# ORM-style query: no SQL string, the Python expression is translated
# to SQL by the ORM.
# NOTE(review): this reads like peewee syntax; the slide labels it
# SQLObject - confirm which ORM is meant.
result = (Review
          .select()
          .where(Review.reviewscore < 2))
(example with SQLObject)
# Reflect the existing database schema into mapped Python classes
# (SQLAlchemy "automap"): no table definitions are written by hand.
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

Base = automap_base()
engine = create_engine("sqlite:////opt/data/finefoods.db")
# Introspect the database and generate one mapped class per table.
# (This DB has a 'review' table - see the schema earlier in the deck.)
Base.prepare(engine, reflect=True)
review = Base.classes.review
session = Session(engine)
from sqlalchemy import func  # SQL functions (count, etc.)

# Same aggregation as the raw SQL, expressed with the ORM query API:
# helpfulness counts for low-score reviews, most frequent first.
query = (session
         .query(review.reviewhelpful,
                func.count(review.reviewhelpful))
         .filter(review.reviewscore < 2)
         .group_by(review.reviewhelpful)
         .order_by(func.count(review.reviewhelpful).desc()))
res = query.all()
res[:3]
[('0/0', 12266), ('1/1', 4809), ('0/1', 3701)]
from sqlalchemy.dialects import sqlite

# Show the SQL that the ORM generates for the query above,
# inserting a line break before GROUP BY for readability.
print(str(query.statement.compile(dialect=sqlite.dialect()))
      .replace('GROUP BY', '\nGROUP BY'))
SELECT review.reviewhelpful, count(review.reviewhelpful) AS count_1 FROM review WHERE review.reviewscore < ? GROUP BY review.reviewhelpful ORDER BY count(review.reviewhelpful) DESC
| mutate | modify/add column |
| filter | filter rows |
| select | select columns |
| group_by | group rows |
| summarize | summarize groups of rows |
(RStudio)
# Connect dplyr to the SQLite database and run the same aggregation.
datasrc <- src_sqlite(dbfilename)
review_tbl <- tbl(datasrc, "review")
# Count helpfulness values among low-score reviews, most frequent first.
# (desc(n) must be an unquoted expression in R dplyr; quoting it as a
#  string sorts by a constant and leaves the rows unordered. The string
#  form is only correct in the rpy2 wrapper shown next.)
res <- filter(review_tbl,
              reviewscore < 2) %>%
    count(reviewhelpful) %>%
    arrange(desc(n))
# The same dplyr pipeline, driven from Python through rpy2.
from rpy2.robjects.lib import dplyr

datasrc = dplyr.src_sqlite(dbfilename)
review_tbl = datasrc.get_table("review")
# R expressions are passed as strings and evaluated by dplyr.
res = (review_tbl
       .filter('reviewscore < 2')
       .count('reviewhelpful')
       .arrange('desc(n)'))
print(res)
Source: query [?? x 2]
Database: sqlite 3.8.6 [/opt/data/finefoods.db]
reviewhelpful n
<chr> <int>
1 0/0 12266
2 1/1 4809
3 0/1 3701
4 1/2 2718
5 2/2 2123
6 0/2 1702
7 2/3 1579
8 1/3 1342
9 3/3 1104
10 2/4 921
# ... with more rows
from rpy2.robjects import r, globalenv
import rpy2.robjects.lib.ggplot2 as gg

# R expression extracting the %i-th field of "votes/total" (e.g. "1/2");
# %i is substituted below with 1 (helpful votes) or 2 (total votes).
split_review = \
    'as.integer(sapply(strsplit(reviewhelpful, "/"), "[", %i))'

# Bubble plot on log scales: helpful votes vs. total votes,
# bubble size proportional to the count (sqrt transform).
p = (gg.ggplot((res
                .collect()
                .mutate(r1 = split_review % 1,
                        r2 = split_review % 2))) +
     gg.aes_string(x='r1+1', y='r2+1', size='n') +
     gg.geom_point(alpha = 0.5) +
     gg.scale_x_log10("review voted helpful") +
     gg.scale_y_log10("review") +
     gg.scale_size(trans="sqrt"))
from rpy2.robjects.lib import grdevices

# Render the R figure to an in-memory PNG and display it in the notebook.
with grdevices.render_to_bytesio(grdevices.png,
                                 width=800,
                                 height=600,
                                 res=120) as b:
    p.plot()

from IPython.display import Image, display
data = b.getvalue()
display(Image(data=data, format='png', embed=True))
# Interactive plot of the same aggregation with bokeh.
# (A duplicate `from bokeh.plotting import figure` import was removed.)
from bokeh.plotting import figure, show, output_notebook
##from bokeh.resources import INLINE
output_notebook()

plot = figure()
# Re-fetch the counts and materialize them locally with collect().
res = (review_tbl
       .filter('reviewscore < 2')
       .count('reviewhelpful')
       .collect())

import math
# x = helpful votes, y = total votes, radius derived from the count.
# NOTE(review): log10(x/100) is negative for counts below 100 -
# confirm a negative radius is intended.
plot.scatter(list(int(x.split('/')[0])
                  for x in res.rx2('reviewhelpful')),
             list(int(x.split('/')[1])
                  for x in res.rx2('reviewhelpful')),
             radius=list(math.log10(x/100) for x in res.rx2('n')),
             fill_alpha=.3)
# Start a local Spark context (2 worker threads) from the notebook.
import findspark
findspark.init()

import pyspark
conf = pyspark.conf.SparkConf()
(conf.setMaster('local[2]')
 .setAppName('ipython-notebook')
 .set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)

from pyspark.sql import SQLContext, Row
sqlcontext = SQLContext(sc)
# Load 10,000 reviews from SQLite into a Spark DataFrame; the column
# names are taken from the DB cursor's description.
cursor.execute('select * from review limit 10000')
review = \
    sqlcontext.createDataFrame(cursor,
                               tuple(x[0] for x in cursor.description))
# Expose the DataFrame to Spark SQL under the table name "review".
review.registerTempTable("review")
The data is now distributed on a Spark cluster.
# Same aggregation as before, now executed by Spark SQL on the
# distributed DataFrame (the 10,000-row sample loaded above).
sql = """
SELECT reviewhelpful, count(*) AS ct
FROM review
WHERE reviewscore < 2
GROUP BY reviewhelpful
ORDER BY ct DESC
"""
counts = sqlcontext.sql(sql)
res = counts.collect()
res[:3]
[Row(reviewhelpful='0/0', ct=241), Row(reviewhelpful='1/1', ct=87), Row(reviewhelpful='0/1', ct=80)]
# Classic word count over the review summaries (map/reduce on the RDD).
# NOTE(review): `names` actually holds (word, count) pairs.
names = (review
         .rdd
         .flatMap(lambda p: p.reviewsummary.split())
         .map(lambda word: (word.lower(), 1))
         .reduceByKey(lambda a, b: a+b))
# The 10 most frequent words.
names.takeOrdered(10, key = lambda x: -x[1])
[('great', 1259),
('the', 914),
('good', 837),
('for', 706),
('a', 695),
('not', 624),
('and', 581),
('best', 568),
('my', 451),
('coffee', 438)]
from collections import Counter

# Tokens dropped before pairing consecutive words.
bad_tokens = set(('.', '!'))

def bigrams(p):
    """Yield (reviewer id, bigram, count) triples for one review row.

    Consecutive word pairs are taken from the lower-cased summary,
    after dropping any token listed in `bad_tokens`.
    """
    reviewer = p.reviewerid
    # The bad-token filter applies to the raw token, before lower-casing.
    words = [w.lower()
             for w in p.reviewsummary.split()
             if w not in bad_tokens]
    pair_counts = Counter('%s %s' % pair
                          for pair in zip(words, words[1:]))
    return ((reviewer, pair, n) for pair, n in pair_counts.items())
# Per-reviewer bigram counts over all review summaries.
names = (review
         .rdd
         .flatMap(bigrams))
# The 10 (reviewer, bigram) pairs with the highest within-review count.
names.takeOrdered(10, key = lambda x: -x[2])
[('A1DTDSL1J6C05F', 'love love', 2),
('A2LS9RBSI8SDZF', 'can not', 2),
('A2A1XYSB692L6J', 'a r', 2),
('A2A1XYSB692L6J', 'h i', 2),
('A212N0TUR007EK', "'made in", 2),
('ABTUNH7645QJL', 'a lot', 2),
('ABTUNH7645QJL', 'lot of', 2),
('ABTUNH7645QJL', 'not a', 2),
('A1QEZ01P4PH82D', 'hot hot', 2),
('A2U51T1TK6CYX9', '2 different', 2)]
# Aggregate the bigram counts across all reviewers:
# drop the reviewer id, then sum counts per bigram.
names = (review
         .rdd
         .flatMap(bigrams)
         .map(lambda x: (x[1], x[2]))
         .reduceByKey(lambda a,b: a+b))
# The 15 most frequent bigrams overall.
names.takeOrdered(15, key = lambda x: -x[1])
[('the best', 208),
('great for', 88),
('but not', 86),
('great product', 84),
('love this', 75),
('my favorite', 73),
('very good', 73),
('a great', 71),
('this is', 70),
('of the', 69),
('good for', 62),
('gluten free', 58),
('for the', 57),
('better than', 56),
('dog food', 55)]
# Restrict to low-score reviews before extracting bigrams.
sql = """
SELECT *
FROM review
WHERE reviewscore < 2
"""
lowscore = sqlcontext.sql(sql)
# Same bigram aggregation as above, on the filtered set.
names = (lowscore
         .rdd
         .flatMap(bigrams)
         .map(lambda x: (x[1], x[2]))
         .reduceByKey(lambda a,b: a+b))
# The 5 most frequent bigrams in low-score reviews.
names.takeOrdered(5, key = lambda x: -x[1])
[('not good', 16),
('do not', 12),
('in the', 12),
('the worst', 11),
('waste of', 10)]
## --- SQL ---
# Select the low-score reviews with Spark SQL...
sql = "SELECT * FROM review WHERE reviewscore < 2"
lowscore = sqlcontext.sql(sql)
## --- Spark ---
# ...aggregate bigram counts with Spark map/reduce...
names = (lowscore
         .rdd
         .flatMap(bigrams)
         .map(lambda x: (x[1], x[2]))
         .reduceByKey(lambda a,b: a+b))
lst = names.takeOrdered(20, key = lambda x: -x[1])
## --- R (from Python) ---
# ...and plot the 20 most frequent bigrams with ggplot2 through rpy2.
from rpy2.robjects.vectors import StrVector, IntVector

dataf = dplyr.DataFrame({'word_pair': StrVector([x[0] for x in lst]),
                         'count': IntVector([x[1] for x in lst])})
# factor(levels=word_pair) keeps the bars in descending-count order.
p = (gg.ggplot(dataf.arrange('desc(count)')) +
     gg.geom_bar(gg.aes_string(x='factor(word_pair, levels=word_pair)',
                               y='count'),
                 stat="identity") +
     gg.scale_x_discrete('word pair') +
     gg.theme(**{'axis.text.x': gg.element_text(angle = 90, hjust = 1)}))
# Render the figure to PNG in memory and display it in the notebook.
with grdevices.render_to_bytesio(grdevices.png,
                                 width=800, height=600,
                                 res=120) as b:
    p.plot()
display(Image(data=b.getvalue(), format='png', embed=True))