Pandas+postgre sql 實作 with code

From Data to Display
DSP E1
Tim Hong

Tim Hong
E-mail: timhong@dsp.im
的
不DSP
不
不
不
OpenData

IPython Notebook
Hello World
Cell

IPython Notebook
Hello World
File naming
Just like Google Docs.

IPython Notebook
Now you try
(1+12)*15

IPython Notebook
Now you try
Sin

IPython Notebook
Now you try
“Sin ”?????!!!!!

IPython Notebook Now you try
“sin”
and try
“cos”

Ubike
iid ItemId
sv Ex.“12”
sd yyyyMMddhhmmss
vtyp Ex.“1”.
sno
sna
sip IP
tot Ex.“38”
sbi Ex.“23”
sarea EX.“ ”
mday yyyyMMddhhmmss EX.“20120426132314”
lat Ex.“25.0408388889”
lng 我 Ex.“121.567894444”
ar EX.“ 2 235 ”
sareaen EX.“Xinyi Dist.”
snaen
nbcnt EX.“2”
bemp EX.“12”

broken_df = pd.read_csv(' / /
ubikeCSV.csv',
encoding='latin1',
parse_dates=['id'],
dayfirst=True,
index_col='id')

Selection
Multiple columns
Ubike
http://public.enthought.com/~kjordahl/pydata/slides/#1

Adaptor “Psycopg2”
Psycopg2??
SQL-type database

Adaptor “Psycopg2”
Local
⼀一
⼀一⼀一Port
是 Print 是

iPython Notebook SQL
Select ___ , ____ From ____ Where .....
了在你

iPython Pandas
+
PostgreSQL
+
Display

In [14]:
# Import all libraries needed for the tutorial
# General syntax to import specific functions in a library:
##from (library) import (specific library function)
from pandas import DataFrame, read_csv
# General syntax to import a library but no functions:
##import (library) as (give the library a nickname/alias)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
# Enable inline plotting
%matplotlib inline
%pylab inline
# Must get this or you will get # NameError: name 'figsize' is not defined
import matplotlib.pylab
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
figsize(15, 5)
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__
sql
In [15]:
import pandas.io.sql
import psycopg2
conn = psycopg2.connect(user='lab')
cur = conn.cursor()
print 0 ...
In [16]:
# conn.close()
Populating the interactive namespace from numpy and matplotlib
Python version 2.7.6 (default, Mar 22 2014, 22:59:56)
[GCC 4.8.2]
Pandas version 0.16.0

In [17]:
try:
cur = conn.cursor()
cur.execute('SELECT 1')
except psycopg2.OperationalError:
pass
print conn.closed # 2
In [18]:
# query db
sql = """
select * from ubike limit 3
"""
ubike_df = pandas.io.sql.read_sql(sql, conn)
ubike_df.head() # try
In [19]:
ubike_df.columns
0
Out[18]:
when_ts where_pt code name area_name space_num avg_bike_num max_bike_
0
2014-
12-08
15:00:00
(25.041,121.556945) 2
(2
)
48 24.000 27
1
2014-
12-08
15:00:00
(25.037797,121.565169) 3 40 10.333 13
2
2014-
12-08
15:00:00
(25.036036,121.562325) 4 60 39.333 40
Out[19]:
Index([u'when_ts', u'where_pt', u'code', u'name', u'area_name',
u'space_num', u'avg_bike_num', u'max_bike_num', u'min_bike_num',
u'bike_num_std', u'avg_space_num', u'max_space_num', u'min_space_num',
u'space_num_std'], dtype='object')

In [20]:
ubike_df.dtypes
In [31]:
# query db
sql = """
select * from ubike
"""
all = pandas.io.sql.read_sql(sql, conn)
In [33]:
all['avg_bike_num'] .plot(legend=False)
Out[20]:
when_ts datetime64[ns]
where_pt object
code object
name object
area_name object
space_num int64
avg_bike_num float64
max_bike_num int64
min_bike_num int64
bike_num_std float64
avg_space_num float64
max_space_num int64
min_space_num int64
space_num_std float64
dtype: object
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f373b908310>

when_ts datetime64[ns] u' ', u' ', where_pt object u' ', u' ', code object u' ', name
object u' ', area_name object u' ', space_num int64 u' ', avg_bike_num float64
u' ', max_bike_num int64 u' ', min_bike_num int64 u' ', bike_num_std
float64 u' ', avg_space_num float64 u' ', max_space_num int64 u' ',
min_space_num int64 u' ', space_num_std float64 u' '
In [21]:
# query db
sql = """
select * from ubike where name = ' ' and (when_ts BETWEEN '2014-12-08'
AND '2014-12-09')
order by when_ts;
"""
df = pandas.io.sql.read_sql(sql, conn)
len(df)
In [22]:
df
Out[21]:
10

pandas or python
Out[22]:
0
2014-
12-08
15:00:00
(25.049845,121.571885) 15 60 38.667 39
1
2014-
12-08
16:00:00
(25.049845,121.571885) 15 60 40.867 44
2
2014-
12-08
17:00:00
(25.049845,121.571885) 15 60 44.000 48
3
2014-
12-08
18:00:00
(25.049845,121.571885) 15 60 39.933 49
4
2014-
12-08
19:00:00
(25.049845,121.571885) 15 60 26.400 29
5
2014-
12-08
20:00:00
(25.049845,121.571885) 15 60 30.000 31
6
2014-
12-08
21:00:00
(25.049845,121.571885) 15 60 32.667 34
7
2014-
12-08
22:00:00
(25.049845,121.571885) 15 60 35.133 39
8
2014-
12-08
23:00:00
(25.049845,121.571885) 15 60 35.000 39
9
2014-
12-09
00:00:00
(25.049845,121.571885) 15 60 28.643 33

In [23]:
df[:3] # :3, :10, 5:10, 'name'
In [24]:
df[['when_ts','avg_bike_num']]
Out[23]:
0
2014-
12-08
15:00:00
(25.049845,121.571885) 15 60 38.667 39
1
2014-
12-08
16:00:00
(25.049845,121.571885) 15 60 40.867 44
2
2014-
12-08
17:00:00
(25.049845,121.571885) 15 60 44.000 48
Out[24]:
when_ts avg_bike_num
0 2014-12-08 15:00:00 38.667
1 2014-12-08 16:00:00 40.867
2 2014-12-08 17:00:00 44.000
3 2014-12-08 18:00:00 39.933
4 2014-12-08 19:00:00 26.400
5 2014-12-08 20:00:00 30.000
6 2014-12-08 21:00:00 32.667
7 2014-12-08 22:00:00 35.133
8 2014-12-08 23:00:00 35.000
9 2014-12-09 00:00:00 28.643

In [25]:
df[['when_ts','avg_bike_num']].plot()
In [26]:
from matplotlib.font_manager import FontProperties, findfont
fp = FontProperties(family='monospace',
style='normal',
variant='normal',
weight='normal',
stretch='normal',
size='medium')
font = findfont(fp)
In [27]:
df[['when_ts','avg_bike_num']].plot()
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3755f87110>
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f37560d2050>

In [28]:
df[['when_ts','avg_bike_num']].plot(kind='kde')
In [16]:
# query db
sql = """
select * from ubike where name = ' ' and (when_ts BETWEEN '2014-12-08' AND
'2014-12-09')
order by when_ts;
"""
ponit2_df = pandas.io.sql.read_sql(sql, conn)
len(ponit2_df)
In [17]:
ponit2_df[:1]
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3756096910>
Out[16]:
10
Out[17]:
0
2014-
12-08
15:00:00
(25.048268,121.552278) 18 38 13.5 15

In [18]:
df['avg_bike_num'].plot()
ponit2_df['avg_bike_num'].plot()
In [19]:
ponit2_df.plot(x='when_ts', y='avg_bike_num')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba9e2b9490>
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba9e167290>

In [20]:
ponit2_df.plot(x='when_ts', y='avg_bike_num');
df.plot(x='when_ts', y='avg_bike_num')
In [21]:
df[['max_space_num','min_space_num']].plot(kind='area');
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba9dfa1810>

In [22]:
df[['space_num','max_space_num','min_space_num']].plot(kind='area', stacked=False);
Table
In [112]:
fig, ax = plt.subplots(1, 1)
ax.get_xaxis().set_visible(False)
df[['space_num','max_space_num','min_space_num']].plot(table=True, ax=ax)
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba80d96490>

In [121]:
from pandas.tools.plotting import table
table(ax, np.round(df[['space_num','max_space_num','min_space_num']].describe(), 2),
loc='upper left', colWidths=[0.1, 0.1, 0.1])
df[['space_num','max_space_num','min_space_num']].plot(table=True, ax=ax)
sql
In [124]:
# query db
sql = """
select a.when_ts as time ,
a.avg_bike_num as point_A,
b.avg_bike_num as point_B
from ubike a
inner join ubike b on
a.when_ts = b.when_ts
and (a.when_ts BETWEEN '2014-12-08' AND '2014-12-09')
and (a.name = ' ' and b.name = ' ');
"""
PointJoin = pandas.io.sql.read_sql(sql, conn)
len(PointJoin)
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba80348ad0>
Out[124]:
10

In [126]:
PointJoin
In [127]:
ax.get_xaxis().set_visible(False)
PointJoin[['point_a','point_b']].plot(table=True, ax=ax)
Bar
In [ ]:
## Now you try loc='upper left' colWidths=[0.1, 0.1, 0.1] remove table=True
Out[126]:
time point_a point_b
0 2014-12-08 15:00:00 13.500 38.667
1 2014-12-08 16:00:00 12.733 40.867
2 2014-12-08 17:00:00 9.615 44.000
3 2014-12-08 18:00:00 12.267 39.933
4 2014-12-08 19:00:00 13.933 26.400
5 2014-12-08 20:00:00 17.200 30.000
6 2014-12-08 21:00:00 10.667 32.667
7 2014-12-08 22:00:00 4.400 35.133
8 2014-12-08 23:00:00 3.867 35.000
9 2014-12-09 00:00:00 0.929 28.643
Out[127]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba8000e150>

In [23]:
ponit2_df.plot(kind='bar');
In [25]:
ponit2_df.plot(kind='bar',x='when_ts', y='avg_bike_num');

In [26]:
ponit2_df[['max_space_num','min_space_num']].plot(kind='bar');
In [27]:
ponit2_df[['max_space_num','min_space_num']].plot(kind='bar',alpha=0.5,stacked=True);
In [28]:
ponit2_df[['max_space_num','min_space_num']].plot(kind='barh', stacked=True);

In [29]:
ponit2_df[['max_space_num','min_space_num']].plot(kind='hist')
In [30]:
ponit2_df[['max_space_num','min_space_num']].hist()
In [31]:
ponit2_df[:1]
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba9dab6c90>
Out[30]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fba9da71d90>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7fba9daece90>]], dtype=object)
Out[31]:
0
2014-
12-08
15:00:00
(25.048268,121.552278) 18 38 13.5 15

In [32]:
# query db
sql = """
select name from ubike where area_name like '% %' group by name
"""
pointA = pandas.io.sql.read_sql(sql, conn)
len(pointA)
In [33]:
pointA
Out[32]:
14
Out[33]:
name
0
1
2 (2 )
3
4
5 (2 )
6
7
8
9
10
11 (2 )
12
13

In [90]:
# query db
sql = """
select * from ubike where name = ' ' and (when_ts BETWEEN '2014-12-25'
AND '2014-12-31')
"""
pointA = pandas.io.sql.read_sql(sql, conn)
len(pointA)
In [35]:
pointA[:1]
In [91]:
# query db
sql = """
select * from tpweather where name = ' ' and (when_ts BETWEEN '2014-12-25'
AND '2014-12-31');
"""
weaterA = pandas.io.sql.read_sql(sql, conn)
len(weaterA)
In [37]:
weaterA[:1]
Out[90]:
145
Out[35]:
0
2014-
12-08
15:00:00
(25.116325,121.534136) 123 44 31 33
Out[91]:
145
Out[37]:
when_ts where_pt name temp max_temp min_temp hum_pct pressure win
0
2014-
12-19
(25.1180133,121.5373439) 17.3889 17.4 16.9 76 1022.38 2.7

sub select
In [92]:
# query db
sql = """
select * from tpweather where when_ts in
(select when_ts
from ubike
where name = ' ' order by when_ts )
and name = ' ' and (when_ts BETWEEN '2014-12-25' AND '2014-12-31' )
order by when_ts
"""
weatherA = pandas.io.sql.read_sql(sql, conn)
len(weatherA)
In [93]:
len(pointA)
In [94]:
len(weatherA)
Out[92]:
145
Out[93]:
145
Out[94]:
145

In [95]:
pointA.plot(x='when_ts', y='avg_bike_num')
weatherA.plot(x='when_ts', y='rainfall')
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba9c0f5890>

In [96]:
pointA['avg_bike_num'].plot()
weatherA['rainfall'].plot()
inner join
In [ ]:
Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba977c5e50>

• ——
不了Sam Redwine你
• Software and cathedrals are much the same
– first we build them, then we pray. (Sam
Redwine)

Pandas+postgre sql 實作 with code

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to Pandas+postgre sql 實作 with code

Similar to Pandas+postgre sql 實作 with code (20)

More from Tim Hong

More from Tim Hong (16)

Recently uploaded

Recently uploaded (20)

Pandas+postgre sql 實作 with code