forked from mmistakes/minimal-mistakes
-
Notifications
You must be signed in to change notification settings - Fork 7
python
Wang Cheng-Jun edited this page Dec 19, 2016
·
1 revision
https://github.com/solarlee/Awesome-Python-Toolbox
- Numerical Python http://nbviewer.jupyter.org/github/jrjohansson/numerical-python-book-code/blob/master/ch13-code-listing.ipynb
- An Introduction to Statistics with Python http://nbviewer.jupyter.org/github/data-journalism/statsintro_python/tree/master/ipynb/
有时为了可以重复,我们需要这样设置:
# Reproducible random choices: seeding a Random instance fixes its sequence.
# (The original wiki line had the comment fused into the code, which made the
# last expressions unreachable text.)
import random

l = [1, 2, 3, 4, 5]
r = random.Random(500)  # seed number is arbitrary
# Two fresh generators built from the same seed make identical choices:
random.Random(500).choice(l), random.Random(500).choice(l)
# A single seeded generator advances its internal state between calls:
r.choice(l), r.choice(l)
def log_binning(x, y, bin_count=50):
max_x = np.log10(max(x))
max_y = np.log10(max(y))
max_base = max([max_x,max_y])
xx = [i for i in x if i>0]
min_x = np.log10(np.min(xx))
bins = np.logspace(min_x,max_base,num=bin_count)
# Based on: http://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy
bin_means_y = (np.histogram(x,bins,weights=y)[0] / np.histogram(x,bins)[0])
bin_means_x = (np.histogram(x,bins,weights=x)[0] / np.histogram(x,bins)[0])
return bin_means_x,bin_means_y
一般而言使用hist命令即可,当我们想要先对数据取log,然后使用hist命令的时候,需要如下调整。
# The most direct way is to just compute the log10 of the limits, compute linearly spaced bins, and then convert back by raising to the power of 10, as below:import pylab as pl import numpy as np data = np.random.normal(size=10000) MIN, MAX = .01, 10.0 pl.figure() pl.hist(data, bins = 10 ** np.linspace(np.log10(MIN), np.log10(MAX), 50)) pl.gca().set_xscale("log") pl.show()
- http://stackoverflow.com/questions/6855710/how-to-have-logarithmic-bins-in-a-python-histogram
import pandas as pd

# Eleven (xi, y) observations used by the GroupBy examples below.
y_values = [3, 5, 9, 12, 18, 22, 28, 35, 49, 60, 65]
data = {'xi': range(1, 12), 'y': y_values}
df = pd.DataFrame(data)
Pandas中的GroupBy操作效率非常高,可以使用.get_group()方法获得分组后的每组数据的内容,例如从7000万条通话记录中寻找其中14万个用户的基站序列,可以先对用户进行GroupBy,即可获取他们的基站序列。
In [16]: df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
In [17]: df3.groupby(['X']).get_group('A')
Out[17]:
X Y
0 A 1
2 A 3
In [18]: df3.groupby(['X']).get_group('B')
Out[18]:
X Y
1 B 4
3 B 2
- 获取每个用户的基站序列
# Group the call records once by calling number so each user's rows can be
# fetched cheaply with get_group().
dfgroupbyobj = call_data_selected.groupby('calling_nbr')
# Base-station (cell) sequence for one user.  NOTE(review): `i` is presumably
# a calling_nbr value and `call_data_selected` a DataFrame defined elsewhere —
# verify against the caller.
node_list = dfgroupbyobj.get_group(i)['calling_cell'].tolist()
How to sort a dict?
webt = sorted(web.iteritems(), key=lambda (k,v): (-v,k)) sorted(data,key=lambda x:-x) sorted(d.items(), key=lambda x: x[1]) sorted(data, reverse = True )
How to return the indices of a sorted list?
>>> s = [2, 3, 1, 4, 5] >>> sorted(range(len(s)), key=lambda k: s[k]) [2, 0, 1, 3, 4] >>>
http://file.allitebooks.com/20160505/Matplotlib%20for%20Python%20Developers.pdf
Matplotlib for Python Developers
强烈推荐此书
参见:https://networkx.github.io/documentation/latest/reference/
import matplotlib.pyplot as plt %matplotlib inline plt.figure(figsize=(10,10)) pos=nx.spring_layout(G) nx.draw_networkx_nodes(G,pos,node_size=20, node_color='b') nx.draw_networkx_labels(G,pos,fontsize=5) nx.draw_networkx_edges(G,pos,edge_color='k') edge_labels = nx.get_edge_attributes(G,'times') nx.draw_networkx_edge_labels(G, pos, labels = edge_labels, label_pos=0.5) plt.show()
如图
from jieba import cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Four short Chinese example documents.
txt = ["我来到北京清华大学",
       "他来到了网易杭研大厦",
       "小明硕士毕业与中国科学院",
       "我爱北京天安门"]
# jieba segments each document; joining with spaces lets sklearn tokenize it.
corpus = [r' '.join(cut(i, cut_all=False)) for i in txt]
# CountVectorizer builds the term-frequency matrix: element a[i][j] is the
# frequency of term j in document i.
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every term.
transformer = TfidfTransformer()
# The inner fit_transform produces the term-frequency matrix; the outer one
# converts it to tf-idf weights.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
# All terms of the bag-of-words model.
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
for i in word:
    print(i)  # print(i) behaves the same on Python 2 and 3 for one argument
weight
https://github.com/whtsky/WeRoBot
import powerlaw
def plotPowerlaw(data, ax, col, xlab):
    """Plot the empirical PDF of `data` and a fitted power-law PDF on `ax`.

    `col` is the line colour, `xlab` the x-axis label.  The fitted exponent
    alpha and xmin of the free fit are annotated inside the axes.
    """
    # Empirical PDF with the fit range pinned at xmin = 1.
    pinned_fit = powerlaw.Fit(data, xmin=1)
    pinned_fit.plot_pdf(color=col, linewidth=2)
    # A second fit lets powerlaw choose xmin; its power-law PDF is dotted.
    free_fit = powerlaw.Fit(data)
    free_fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax)
    alpha = free_fit.power_law.alpha
    xmin = free_fit.power_law.xmin
    x_lo, _ = ax.get_xlim()
    _, y_hi = ax.get_ylim()
    ax.text(x_lo * 5, y_hi / 10,
            r"$\alpha = %d \:\:, x_{min} = %d$" % (alpha, xmin),
            fontsize=20)
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$P(k)$', fontsize=20)
def plotDegreeDistribution(G):
    """Plot the degree distribution P(k) of graph G on log-log axes.

    G : a networkx graph object (networkx 1.x API, where G.degree()
        returns a dict of node -> degree).
    """
    # Count how many nodes carry each degree value.
    degs = defaultdict(int)
    for i in G.degree().values():
        degs[i] += 1
    items = sorted(degs.items())
    # One transpose suffices; the original repeated this line verbatim.
    x, y = np.array(items).T
    # Normalise counts into probabilities.
    y_sum = np.sum(y)
    y = y / float(y_sum)
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$k$', fontsize=20)
    plt.ylabel('$P(k)$', fontsize=20)
# OLS regression of log(degree+1) on log(rank) to estimate a power-law-like
# exponent, then plot the data and the fitted line on log-log axes.
# NOTE(review): `degree_list` and `plt` are defined elsewhere in the file.
import statsmodels.api as sm
import numpy as np
# x: log of ranks 1..N; y: log of (degree+1) so zero degrees stay finite.
x = np.log(range(1,len(degree_list)+1))
y = np.log([d+1 for d in degree_list])
# Prepend a constant column so the regression fits an intercept.
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(range(len(degree_list)), degree_list, 'rs', label= 'Data')
# Back-transform the fitted line from log-log space for plotting.
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$', fontsize = 20)
plt.ylabel(r'$Replies$', fontsize = 20)
# Annotate the slope and goodness of fit inside the axes.
plt.text(max(range(len(degree_list)))/300,max(degree_list)/20,
r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
print beta, r2
# Record intermediate results to a file every 100 users.
# NOTE(review): `user_list`, `entropy`, `node_list`, `entropy_list` and
# `nums` are defined elsewhere in the file.
for k, i in enumerate(user_list):
    node_entropy = entropy(node_list)
    entropy_list.append(node_entropy)
    if k % 100 == 0:
        # Append mode: the original reopened with 'w+', which truncated the
        # log at every checkpoint so only the last record survived, and it
        # never closed the handle.  The with-block closes (and flushes) it.
        with open('log_file.log', 'a') as log_file:
            log_file.write(str(k))
            log_file.write('\t')
            log_file.write(str(float(k) / nums))
            log_file.write('\t')
            log_file.write(str(node_entropy))
            log_file.write('\n')
- 将实时结果和进度输出到屏幕上
def flushPrint(s):
    """Overwrite the current terminal line with `s` and flush immediately.

    The leading carriage return moves the cursor back to the start of the
    line, so successive calls act as an in-place progress display.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()