import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Checking whether an integer is prime is straightforward: a prime is divisible only by 1 and itself. By a standard result, there is no need to test divisors from 2 all the way to n; testing up to int(sqrt(n))+1 suffices, which improves efficiency. Note that the function returns True or False, which is convenient for boolean indexing later.
def is_prime(num):
if num <= 1:
return False
for i in range(2,int(np.sqrt(num))+1):
if num % i == 0:
return False
return True
The brute-force approach: loop from 1 to 100, test each number for primality, and add it to ans if it is prime, otherwise skip it. Because %%timeit executes the cell many times (1000 loops in the report below), the magic is commented out after timing.
def prime_sum_iter(n=100):
ans = 0
for i in range(1,n+1):
if is_prime(i):
ans += i
return ans
print(prime_sum_iter())
# %%timeit
# 1000 loops, best of 3: 253 µs per loop
With numpy we can vectorize, traversing all the elements in a more concise way. In this example, the loop-based mindset takes one number at a time and tests it for primality; the vectorized mindset takes the whole array as the variable, tests all of its elements at once, and returns an array of the same size. Since is_prime() itself accepts a single integer, to make it accept vectors or arrays it has to be vectorized: is_prime_vec = np.vectorize(is_prime). (Note that np.vectorize exists mainly for convenience, not performance; under the hood it is essentially still a Python loop, which the timings here bear out.)
np.vectorize: "Define a vectorized function which takes a nested sequence of objects or numpy arrays as inputs and returns a numpy array as output"; see the documentation for details.
is_prime_vec(np_arr) returns a boolean array. For example, if np_arr = array([1,2,3,4]), then is_prime_vec(np_arr) returns array([False, True, True, False]), since 2 and 3 are prime while 1 and 4 are not. np_arr[is_prime_vec(np_arr)] is boolean indexing: put simply, it returns the elements at the True positions, here array([2, 3]). Summing that reproduces what the loop does.
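To make the boolean-indexing step concrete, here is a minimal standalone sketch of the example just described:
np_arr = np.array([1, 2, 3, 4])
is_prime_vec = np.vectorize(is_prime) # wrap is_prime so it accepts arrays
mask = is_prime_vec(np_arr) # array([False, True, True, False])
print(np_arr[mask]) # [2 3], only the positions where mask is True
print(np_arr[mask].sum()) # 5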
def prime_sum_vect(n=100):
np_arr = np.arange(1,n+1)
is_prime_vec = np.vectorize(is_prime)
return np.sum(np_arr[is_prime_vec(np_arr)])
print(prime_sum_vect())
# %%timeit
# 1000 loops, best of 3: 286 µs per loop
We represent the drunkard's position with coordinates. At each step we generate two random numbers: one is the step length, i.e. how far the drunkard moves, assumed to be 1 or 2, r = np.random.randint(1,3,N); the other is the direction, an angle from 0 to 360 degrees, theta = np.radians(np.random.randint(0,361,N)), noting the conversion to radians. After each step the position is the cumulative sum of all previous displacements, which cumsum computes conveniently: x = np.cumsum(r*np.cos(theta)), y = np.cumsum(r*np.sin(theta)). Then we are done; plot it to see just how drunken the walk looks.
N = 500 # steps
r = np.random.randint(1,3,N) # move 1 or 2 units of distance each step
theta = np.radians(np.random.randint(0,361,N))
# np.cumsum returns the cumulative sum of the elements along a given axis
x = np.cumsum(r*np.cos(theta))
y = np.cumsum(r*np.sin(theta))
plt.plot(x,y);
We can partition the integration interval $[a,b]$ into smaller subintervals, and approximate the area under the curve for each subinterval by the area of the trapezoid created by linearly interpolating between the two function values at each end of the subinterval: <img src="http://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Trapezoidal_rule_illustration.png/316px-Trapezoidal_rule_illustration.png" /> The blue line represents the function $f(x)$ and the red line is the linear interpolation. By subdividing the interval $[a,b]$, the area under $f(x)$ can thus be approximated as the sum of the areas of all the resulting trapezoids.
If we denote by $x_{i}$ ($i=0,\ldots,n,$ with $x_{0}=a$ and $x_{n}=b$) the abscissas where the function is sampled, then
$$ \int_{a}^{b}f(x)dx\approx\frac{1}{2}\sum_{i=1}^{n}\left(x_{i}-x_{i-1}\right)\left(f(x_{i})+f(x_{i-1})\right). $$
The common case of using equally spaced abscissas with spacing $h=(b-a)/n$ reads simply
$$ \int_{a}^{b}f(x)dx\approx\frac{h}{2}\sum_{i=1}^{n}\left(f(x_{i})+f(x_{i-1})\right). $$
The computation below needs only this formula.
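Expanding the sum explains the expression used in the code below (2*arr.sum() - arr[0] - arr[-1]): each interior sample belongs to two trapezoids, while the two endpoints belong to only one:
$$ \int_{a}^{b}f(x)dx\approx\frac{h}{2}\left(f(x_{0})+2\sum_{i=1}^{n-1}f(x_{i})+f(x_{n})\right). $$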
The idea is simple: split the integration interval into many small pieces and replace each with a trapezoid; locally this is still the idea of approximating the curve with straight lines. Here we integrate the quadratic below and compare against scipy's integration routine (the exact value is 4.5) as a check.
$$
\int_a^b (x^2 - 3x + 2) dx
$$
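For reference, the exact value of 4.5 quoted above comes straight from the antiderivative:
$$ \int_{-1}^{2}\left(x^{2}-3x+2\right)dx=\left[\frac{x^{3}}{3}-\frac{3x^{2}}{2}+2x\right]_{-1}^{2}=\frac{2}{3}-\left(-\frac{23}{6}\right)=\frac{27}{6}=4.5. $$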
from scipy import integrate
import numpy as np
def f(x):
return x*x - 3*x + 2
def trape(f,a,b,n=100):
    f = np.vectorize(f) # vectorize so f can accept array input
h = float(b - a)/n
arr = f(np.linspace(a,b,n+1))
return (h/2.)*(2*arr.sum() - arr[0] - arr[-1])
def quad(f,a,b):
return integrate.quad(f,a,b)[0] # compare with python library result
a, b = -1, 2
print (trape(f,a,b))
print (quad(f,a,b))
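As an extra cross-check (not part of the original comparison), numpy's built-in np.trapz (renamed np.trapezoid in NumPy 2.0) implements the same composite trapezoidal rule and should agree with trape:
xs = np.linspace(a, b, 101) # 100 subintervals, matching trape's default n=100
print(np.trapz(f(xs), xs)) # numpy's trapezoidal rule, close to the exact 4.5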
Simulating life is a bit like a little game: imagine many tiny organisms, or cells, that can come alive or die depending on how many neighbours each cell has. The rules are as follows (quoted from the Wikipedia article on Conway's Game of Life):
The universe of the Game of Life is an infinite two-dimensional orthogonal grid of square cells, each of which is in one of two possible states, live or dead. Every cell interacts with its eight neighbours, which are the cells that are directly horizontally, vertically, or diagonally adjacent. At each step in time, the following transitions occur:
- Any live cell with fewer than two live neighbours dies, as if caused by underpopulation.
- Any live cell with more than three live neighbours dies, as if by overcrowding.
- Any live cell with two or three live neighbours lives, unchanged, to the next generation.
- Any dead cell with exactly three live neighbours becomes a live cell.
The initial pattern constitutes the 'seed' of the system. The first generation is created by applying the above rules simultaneously to every cell in the seed – births and deaths happen simultaneously, and the discrete moment at which this happens is sometimes called a tick. (In other words, each generation is a pure function of the one before.) The rules continue to be applied repeatedly to create further generations.
The goal is to apply these rules and determine the state of the system after a number of generations: which cells live and which die. For example:
Z = np.array([[0,0,0,0,0,0],
[0,0,0,1,0,0],
[0,1,0,1,0,0],
[0,0,1,1,0,0],
[0,0,0,0,0,0],
[0,0,0,0,0,0]])
After evolving (two generations here) it becomes
array([[0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 1, 0],
[0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]])
The key to the algorithm is counting each cell's neighbours to decide life or death; cells on the border are assumed dead, represented by 0.
def evolve(Z):
N = (Z[0:-2,0:-2] + Z[0:-2,1:-1] + Z[0:-2,2:] +
Z[1:-1,0:-2] + Z[1:-1,2:] +
Z[2: ,0:-2] + Z[2: ,1:-1] + Z[2: ,2:])
# Apply rules
    birth = (N==3) & (Z[1:-1,1:-1]==0) # dead cells with exactly 3 neighbours come alive
    survive = ((N==2) | (N==3)) & (Z[1:-1,1:-1]==1) # live cells with 2 or 3 neighbours survive
    Z[...] = 0 # reset everything to 0
    Z[1:-1,1:-1][birth | survive] = 1 # set the born and surviving cells to 1
return Z
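As a quick sanity check, here is a minimal sketch that runs evolve on the small example grid from above (note that evolve mutates its argument in place); two generations reproduce the evolved grid shown earlier:
Z_demo = np.array([[0,0,0,0,0,0],
                   [0,0,0,1,0,0],
                   [0,1,0,1,0,0],
                   [0,0,1,1,0,0],
                   [0,0,0,0,0,0],
                   [0,0,0,0,0,0]])
evolve(Z_demo) # first generation
evolve(Z_demo) # second generation
print(Z_demo)  # matches the evolved grid shown above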
Z = np.random.randint(0,2,(256,512))
for i in range(100): evolve(Z)
# plot
size = np.array(Z.shape)
dpi = 72.0
figsize= size[1]/float(dpi),size[0]/float(dpi)
fig = plt.figure(figsize=figsize, dpi=dpi, facecolor="white")
fig.add_axes([0.0, 0.0, 1.0, 1.0], frameon=False)
plt.imshow(Z,interpolation='nearest', cmap=plt.cm.hot)
plt.xticks([]), plt.yticks([])
plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import *
from scipy.stats import norm
sns.set_style("darkgrid") # seaborn's darkgrid style looks nicer
def random_walk(m,n,N=200):
fig = plt.figure(figsize=(12,5))
for i in range(1,m*n+1):
r = np.random.randint(1,3,N)
theta = np.radians(np.random.randint(0,361,N))
x = np.cumsum(r*np.cos(theta))
y = np.cumsum(r*np.sin(theta))
        plt.subplot(m,n,i) # switch to the i-th subplot
        plt.plot(x,y)
        plt.xticks([]), plt.yticks([]) # remove the ticks
plt.show()
random_walk(3,4)
def f(x):
return x*x - 3*x
a,b = -1,3
x = np.linspace(a,b,100,endpoint=True)
# basic plot
plt.figure(figsize=(10,8))
plt.plot(x,f(x),color='black',alpha=.1)
# fill the integration region with fill_between
plt.axhline(0,color='gray')
plt.fill_between(x,0,f(x),where=f(x)>0,color='blue',alpha=.15)
plt.fill_between(x,0,f(x),where=f(x)<0,color='blue',alpha=.05)
n = 7 # a coarse partition, so the trapezoids visibly differ from the curve
# draw the trapezoids
x = np.linspace(a,b,n,endpoint=True) # abscissas, computed once rather than on every loop iteration
for i in range(n):
    # vertical dashed lines
plt.plot([x[i],x[i]],[0,f(x[i])],color='black',linestyle='--')
    # mark the sample points
plt.scatter(x[i],f(x[i]),20,color='black')
if i==0: continue
    # the slanted top edge of each trapezoid
plt.plot([x[i-1],x[i]],[f(x[i-1]),f(x[i])],color='black',linestyle='--')
ax = plt.gca()
# annotate the curve
ax.annotate(r'$f(x)=x^2 - 3x$', xy=(2, -2), xytext=(2.5, -2),arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-.2"))
# move the spines to the origin
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data',0))
ax.spines['left'].set_position(('data',0))
# rotate the tick labels
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_rotation(45)
# set axis limits and ticks
plt.xlim([a-.1,b+.1]),plt.xticks([-1,0,1,2,3])
plt.ylim([-2.3,4.1]),plt.yticks([-2,0,2,4])
plt.show()
The basic idea is simple: write a function you are interested in, then specify which parameters should be interactive. Calling interact on the function produces the interactive controls, e.g. a slider. See the ipywidgets documentation for more.
# plot a normal distribution; adjust its two parameters and watch the pdf and cdf change
def f(mean,std):
x = np.arange(-10, 10, 0.1)
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].plot(x,norm.pdf(x,0,1))
axes[0].plot(x,norm.pdf(x,mean,std))
axes[1].plot(x,norm.cdf(x,0,1))
axes[1].plot(x,norm.cdf(x,mean,std))
# fig, axes = plt.subplots(2, 2, figsize=(10, 5))
# axes[0][0].plot(x,norm.pdf(x,0,1))
# axes[0][1].plot(x,norm.cdf(x,0,1))
# axes[1][0].plot(x,norm.pdf(x,mean,std))
# axes[1][1].plot(x,norm.cdf(x,mean,std))
interact(f,mean=(-7.,7,0.5),std=(0.0,5));
import pandas as pd
import numpy as np
from pandas_datareader import data, wb # requires: pip install pandas_datareader
import datetime
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
# define the date range for the data
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2016,5,20)
sh = data.DataReader("000001.SS", 'yahoo', start, end)
sh.head(3) # data fetched successfully
sh.describe() # overview of the data
sh['Close'].plot(); # a quick look at the closing-price trend
pd.read_csv and to_csv
Reading data from a file is a very common operation.
sh.to_csv('sh.csv',header=None)
names = ['Date','Open','High','Low','Close','Volume','Adj Close']
sh1 = pd.read_csv('sh.csv',names=names,index_col='Date')
sh1.tail(2)
# drop the Volume column, since all its values are 0
sh = sh.drop('Volume',axis=1)
sh.head(2) # the Volume column is gone
Sometimes values are missing and the data are incomplete, so it is worth a check.
# detect missing values with the isnull function
pd.isnull(sh).head() # or sh.isnull()
But that returns a boolean DataFrame, and I first just want to know whether any values are missing at all, without scanning by eye.
Use any(); if there are missing values, sum() tells you how many.
# see http://stackoverflow.com/questions/29530232/python-pandas-check-if-any-value-is-nan-in-dataframe
sh.isnull().values.any() # the data are complete
sh.isnull().values.sum() # no missing values
sh.head()
Here we set a few positions to np.nan and then fill them back in. This has no real meaning; it is purely for practice.
Set the NaNs as follows:
sh.iloc[0,:] = np.nan
sh.iloc[[1,3],1] = np.nan
sh.iloc[2,2] = np.nan
sh.iloc[3,3] = np.nan
sh.head(4)
# count NaNs per column
def num_missing(x):
    return x.isnull().sum()
print("Missing values count per column:")
print(sh.apply(num_missing, axis=0))
dropna
It takes several parameters:
- how='all' drops only the rows that are entirely NaN (with axis=1, columns instead);
- how='any' (the default) drops every row or column that contains any NaN;
- inplace=True performs the operation in place and returns nothing; inplace=False (the default) returns the result and leaves the original DataFrame unchanged.
A toy illustration of the two how modes appears right after the call below.
sh.dropna(how='all',inplace=True)
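On a tiny hypothetical frame (made up purely for illustration, not the stock data), the difference looks like this:
# toy frame: row 0 is all NaN, row 2 has one NaN
demo = pd.DataFrame({'a': [np.nan, 1, np.nan], 'b': [np.nan, 2, 3]})
demo.dropna(how='all') # drops only row 0 (every value NaN)
demo.dropna(how='any') # drops rows 0 and 2 (any value NaN)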
sh.head(3)
Filling with fillna
sh['High'].fillna(sh['High'].mean(),inplace=True) #fill in with mean
sh['Low'].fillna(method='ffill',inplace=True) #fill in with value from previous row
sh['Close'].fillna(method='bfill',inplace=True) # fill in with the value from the next row
sh.head()
sh.isnull().values.sum()
The daily change is how much the price moved relative to the previous day's close. Add a Change column holding the difference between each day's Close and the previous day's. Note that the data have gaps; some dates have no record.
change = sh.Close.diff()
change.fillna(change.mean(),inplace=True)
sh['Change'] = change
sh.head(3)
# sort by Change; sh itself is unchanged (pass inplace=True if needed)
sh.sort_values(by='Change',ascending=False).head(3)
sh.head()
# shift the series to compute the percent change by hand
# sh['pct_change'] = (sh['Change'] - sh['Change'].shift(1)) / sh['Change'].shift(1)
# or use the pct_change method
sh['pct_change'] = sh.Change.pct_change()
sh.iloc[5:9]
Use applymap to format the data, keeping two decimal places everywhere; this is an element-wise operation.
sh.head(2)
format = lambda x: '%.2f' % x
sh = sh.applymap(format)
sh.head(2)
sh = sh.applymap(float) # convert the values back to float
sh.iloc[1] # positional indexing (.ix is deprecated)
Use apply to find each column's minimum, maximum, and range.
def f(x):
return pd.Series([x.min(),x.max(),x.max()-x.min()],index=['min','max','range'])
sh.apply(f)
# group by month
sh['group_index'] = sh.index.map(lambda x: 100*x.year + x.month)
sh.head(2).append(sh.tail(2)) # inspect group_index; as expected
sh.groupby('group_index') # returns a GroupBy object
# aggregate different columns with different functions
def peak_to_peak(arr):
return arr.max() - arr.min()
grouped = sh.groupby('group_index').agg({
'Open' : peak_to_peak,
'High' : 'max',
'Low' : 'min',
'Change' : 'mean'
})
grouped.head(3).append(grouped.tail(3))
sh.index # a DatetimeIndex offers many convenient operations
# pd.TimeGrouper is deprecated; pd.Grouper is the modern equivalent
grouped_1 = sh.groupby(pd.Grouper(freq="M")).agg({
'Open' : peak_to_peak,
'High' : 'max',
'Low' : 'min',
'Change' : 'mean'
})
grouped_1.head(3).append(grouped_1.tail(3))
You can downsample as well as upsample.
# the resample('M',how='mean') syntax is being removed; .mean() and .apply() are recommended
# a dict can be passed to apply
grouped_2 = sh[['Open','High','Low','Change']].resample('M').apply({
'Open' : peak_to_peak,
'High' : 'max',
'Low' : 'min',
'Change' : 'mean'
})
grouped_2.head(3).append(grouped_2.tail(3))
sh[['Open']].plot();
plt.show()
import pandas as pd
import requests
import numpy as np
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
%matplotlib inline
html_1 = requests.get('http://zh.wikipedia.org/wiki/%E7%9C%81%E4%BC%9A')
html_1_text = html_1.text
dfs_1 = pd.read_html(html_1_text,header=0,attrs={'class': 'wikitable'})
dfs_1[0].head()
read_html depends on libraries such as html5lib, lxml, and Beautiful Soup; it raises an error if they are not installed.
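If any of them are missing, installing along these lines should cover it (package names only, versions left unpinned):
!pip install html5lib lxml beautifulsoup4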
url = 'http://data.earthquake.cn/datashare/globeEarthquake_csn.html'
html = requests.get(url)
requests falls back to ISO-8859-1 encoding here, which garbles the Chinese text.
html.encoding
html_text = html.text
dfs = pd.read_html(html_text,header=0)
dfs[4].head()
html.encoding = html.apparent_encoding
html.encoding # GB2312, as it turns out; I had assumed UTF-8 at first...
html_text = html.text
dfs = pd.read_html(html_text,header=0) # returns a list of DataFrames, one per table
dfs[4].head()
df = dfs[4]
df.columns = ['date','time','lats','lons','dept','mag','class','place']
df.head(3)
I don't know much about seismology, so beyond looking at the distribution I'm not sure what else to analyse; let's start with a distribution map.
Drop the rows whose magnitude starts with Ms (7 in total) and keep only the ML ones, to simplify the analysis.
Ms = df.mag.map(lambda x: not x.startswith('Ms')) # boolean Series
df = df[Ms]
Convert the magnitudes to numbers.
import re
get_num = lambda x: float(re.findall(r'\d+\.\d+', x)[0])
# a = 'ML0.5'
# n = re.findall('\d+\.\d+', a)
# float(n[0])
# df['mag_num'] = df['mag'].map(get_num) triggers a SettingWithCopyWarning here
# http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
temp = df['mag'].map(get_num)
df.loc[:,('mag_num')] = temp
df.head(3)
# lower left
llcrnrlon, llcrnrlat = df['lons'].min(), df['lats'].min()
# upper right
urcrnrlon, urcrnrlat = df['lons'].max(), df['lats'].max()
lons, lats = list(df['lons']), list(df['lats'])
mags = list(df['mag_num'])
Different magnitude classes get different colours.
# colour by magnitude
def get_marker_color(magnitude):
# Returns green for small earthquakes, yellow for moderate
# earthquakes, and red for significant earthquakes.
if magnitude < 1.5:
return ('go')
elif magnitude < 3.0:
return ('yo')
else:
return ('ro')
fig = plt.figure(figsize=(14,10))
ax = plt.subplot(1,1,1)
eq_map = Basemap(projection='merc', resolution = 'l', area_thresh = 1000.0,
lat_0=0, lon_0=120,
llcrnrlon=llcrnrlon-5, llcrnrlat=llcrnrlat-8,
urcrnrlon=urcrnrlon+10, urcrnrlat=urcrnrlat+3)
eq_map.drawcoastlines()
eq_map.drawcountries()
eq_map.fillcontinents(color = 'lightgray')
eq_map.drawmapboundary()
eq_map.drawmeridians(np.arange(0, 360, 30))
eq_map.drawparallels(np.arange(-90, 90, 30))
min_marker_size = 4
for lon, lat, mag in zip(lons,lats,mags):
x,y = eq_map(lon, lat)
msize = mag*min_marker_size
marker_string = get_marker_color(mag)
eq_map.plot(x, y, marker_string, markersize=msize)
plt.show()
This shows the nationwide distribution of earthquakes over the recent period.
This part mainly practises basic pandas operations.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
# a first look at the data
!head -n 4 qqdata.csv
!wc -l qqdata.csv # the data are small, just over ten thousand rows, so read them directly
The time column reads in as str; parsing it into a datetime type makes analysis more convenient.
# the default parse_dates=True cannot parse this format
# http://stackoverflow.com/questions/17465045/can-pandas-automatically-recognize-dates
dateparse = lambda x: pd.to_datetime(x, format='%Y/%m/%d %H:%M:%S')
qq = pd.read_csv('qqdata.csv', parse_dates=['time'], date_parser=dateparse)
qq['time'][0] # a Timestamp now, not a str
qq.head(3)
Group size
len(qq['id'].unique()) # 144 members in the group
Are the timestamps unique? No, there are duplicates, so don't set time as the index for now.
time = qq['time']
len(time) == len(time.unique())
Define the "chatterbox" as the person with the most messages.
qq['count'] = 1 # add a column
# since qq['count'] is all 1s, count() could equally be sum()
gp_by_id = qq['count'].groupby(qq['id']).count().sort_values(ascending=False)
type(gp_by_id) # a Series
gp_by_id[:5]
plt.figure(figsize=(6,4))
ax = gp_by_id[:10].plot.bar()
ax.set_xticklabels(gp_by_id.index[:10],rotation=45)
ax.set_xlabel('');
There is a very strange id: )chailed (104: Connection reset by pee. Let's confirm it is actually in the data.
(qq['id'] == ')chailed (104: Connection reset by pee').any()
And it is a perpetual lurker who has said exactly one thing, with a name this hard to remember; dear group owner, feel free to kick them.
gp_by_id.loc[')chailed (104: Connection reset by pee']
Which days of the week do people chat most?
# add a weekday column, derived from time
qq['weekday'] = qq['time'].map(lambda x : x.weekday())
gp_by_weekday = qq['count'].groupby(qq['weekday']).count()
gp_by_weekday.plot.bar(); # so much chatter on workdays instead of working
# add an hour column, derived from time
qq['hour'] = qq['time'].map(lambda x : x.hour)
gp_by_hour = qq['count'].groupby(qq['hour']).count()
gp_by_hour.plot.bar();
# add a day column, derived from time
qq['day'] = qq['time'].map(lambda x : x.date())
gp_by_day = qq['count'].groupby(qq['day']).count()
ax = gp_by_day.plot.bar();
ax.set_xticks([])
ax.set_xticklabels([]);
Define a person as active on a day if they said anything that day.
# qq.groupby('id') groups by id
# .day because we only care about active days now
# .nunique() counts the number of distinct active days
# equivalent to .apply(lambda x: len(x.unique()))
gp_by_act_day = qq.groupby('id').day.nunique().sort_values(ascending=False)
plt.figure(figsize=(6,4))
ax = gp_by_act_day[:10].plot.bar()
ax.set_xticklabels(gp_by_act_day.index[:10],rotation=45)
ax.set_xlabel('');
Does the message volume rise when more people are speaking?
# active users per day
people = qq['id'].groupby(qq['day']).nunique()
# messages per day
speech = qq['count'].groupby(qq['day']).count()
# the scatter plot below shows a positive correlation
plt.figure(figsize=(6,4))
plt.scatter(people,speech);
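To put a number on the relationship (an extra step, not in the original), the Pearson correlation between the two series can be computed directly:
print(people.corr(speech)) # Pearson correlation; a value near 1 means strongly positive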