User Tools

Site Tools


chapter2

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
chapter2 [2014-04-23 12:32]
so_nakagawa [2.1.py]
chapter2 [2014-04-23 12:51] (current)
so_nakagawa [2.3.py]
Line 127: Line 127:
  
 ===== 2.2.py ==== ===== 2.2.py ====
-#​!/​usr/​bin/​python +  ​#​!/​usr/​bin/​python 
-import pandas as pd+  import pandas as pd 
 +   
 +  ##p28 
 +  inpath = "​{}Dropbox/​bmi/​reading_circle/​pydata-book-master/​ch02/​movielens/"​.format('/​Users/​so/'​) 
 +   
 +  unames = ['​user_id',​ '​gender',​ '​age',​ '​occupation',​ '​zip'​] 
 +  users = pd.read_table(inpath+"​users.dat",​ sep='::',​ header=None,​ names=unames) 
 +   
 +  rnames = ['​user_id',​ '​movie_id',​ '​rating',​ '​timestamp'​] 
 +  ratings = pd.read_table(inpath+'​ratings.dat',​ sep='::',​ header=None,​ names=rnames) 
 +   
 +  mnames = ['​movie_id',​ '​title',​ '​genres'​] 
 +  movies = pd.read_table(inpath+'​movies.dat',​ sep='::',​ header=None,​ names=mnames) 
 +   
 +  #print users[:5] 
 +   
 +  #print ratings[:​5] 
 +   
 +  ##p29 
 +  #print movies[:​5] 
 +   
 +  #print ratings 
 +   
 +  data = pd.merge(pd.merge(ratings,​ users), movies) 
 +  #print data 
 +   
 +  ##p30 
 +  mean_ratings = data.pivot_table('​rating',​ rows='​title',​ cols='​gender',​ aggfunc='​mean'​) 
 +  #print mean_ratings[:​5] 
 +   
 +  ##p31 
 +  ratings_by_title = data.groupby('​title'​).size() 
 +  #print ratings_by_title[:​10] 
 +   
 +  active_titles = ratings_by_title.index[ratings_by_title >= 250] 
 +  #print active_titles 
 +   
 +  mean_ratings = mean_ratings.ix[active_titles] 
 +  #print mean_ratings 
 +   
 +  ##p32 
 +  top_female_ratings = mean_ratings.sort_index(by='​F',​ ascending=False) 
 +  #print top_female_ratings[:​10] 
 +   
 +  mean_ratings['​diff'​] = mean_ratings['​M'​] - mean_ratings['​F'​] 
 +  sorted_by_diff = mean_ratings.sort_index(by='​diff'​) 
 +  #print sorted_by_diff[:​15] 
 +   
 +  #print sorted_by_diff[::​-1][:​15] 
 +   
 +  rating_std_by_title = data.groupby('​title'​)['​rating'​].std() 
 +  rating_std_by_title = rating_std_by_title.ix[active_titles] 
 +  print rating_std_by_title.order(ascending=False)[:​10]
  
-##p28 +===== 2.3.py ==== 
-inpath = "{}Dropbox/​bmi/​reading_circle/​pydata-book-master/​ch02/​movielens/".format('/Users/so/')+  #​!/​usr/​bin/​python 
 +  import pandas as pd 
 +  import numpy as np 
 +  import matplotlib.pyplot as plt 
 +   
 +  ​##p34 
 +  inpath = "/Users/so/Dropbox/bmi/reading_circle/pydata-book-master/ch02/"
 +   
 +  #? unclear 
 +  #names = inpath + "​ch02/​names/" 
 +  #print names.head(10) 
 +   
 +  ##p35 
 +  names1880 = pd.read_csv(inpath+'names/yob1880.txt',​ names=['​name',​ '​sex',​ '​births'​]) 
 +   
 +  #print names1880 
 +   
 +  ##p36 
 +   
 +  #print names1880.groupby('​sex'​).births.sum() 
 +   
 +  years = range(1880, 2011) 
 +   
 +  pieces = [] 
 +  columns = ['​name',​ '​sex',​ '​births'​] 
 +   
 +  for year in years: 
 +    path = inpath+'​names/yob%d.txt'​ % year 
 +    frame = pd.read_csv(path,​ names=columns) 
 +     
 +    frame['​year'​] = year 
 +    pieces.append(frame) 
 +  names = pd.concat(pieces,​ ignore_index=True) 
 +   
 +  #p37 
 +   
 +  #print names 
 +   
 +  total_births = names.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc=sum) 
 +   
 +  #print total_births.tail() 
 +   
 +  def add_prop(group):​ 
 +    births = group.births.astype(float) 
 +   
 +    group['prop'] = births / births.sum()
 +     
 +    ##p38 
 +    return group 
 +  names = names.groupby(['year', '​sex'​]).apply(add_prop) 
 +   
 +  #print names 
 +   
 +  ##p39 
 +  #print np.allclose(names.groupby(['​year',​ '​sex'​]).prop.sum(),​ 1) 
 +   
 +  def get_top1000(group):​ 
 +    return group.sort_index(by='​births',​ ascending=False)[:​1000] 
 +  grouped = names.groupby(['​year',​ '​sex'​]) 
 +  top1000 = grouped.apply(get_top1000) 
 +   
 +  #print top1000 
 +   
 +  pieces = [] 
 +  for year, group in names.groupby(['​year',​ '​sex'​]):​ 
 +    pieces.append(group.sort_index(by='​births',​ ascending=False)[:​1000]) 
 +  top1000 = pd.concat(pieces,​ ignore_index=True) 
 +   
 +  #print top1000 
 +   
 +  boys = top1000[top1000.sex == '​M'​] 
 +  girls = top1000[top1000.sex == '​F'​] 
 +   
 +  ##p40 
 +  total_births = top1000.pivot_table('​births',​ rows='​year',​ cols='​name',​ aggfunc=sum) 
 +   
 +  #print total_births 
 +   
 +  subset = total_births[['​John',​ '​Harry',​ '​Mary',​ '​Marilyn'​]] 
 +   
 +  subset.plot(subplots=True,​ figsize=(12,​ 10), grid=False, title="​Number of births per year"​) 
 +  #​plt.show() 
 +   
 +  ##p41 
 +  table = top1000.pivot_table('​prop',​ rows='​year',​ cols='​sex',​ aggfunc=sum) 
 +  table.plot(title='​Sum of table1000.prop by year and sex', yticks=np.linspace(0,​ 1.2, 13), xticks=range(1880,​ 2020, 10)) 
 +  #​plt.show() 
 +   
 +  df = boys[boys.year == 2010] 
 +  #print df 
 +   
 +  #p42 
 +  prop_cumsum = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() 
 +   
 +  #print prop_cumsum[:​10] 
 +   
 +  #ERROR, object type is different 
 +  #print np.searchsorted(prop_cumsum,​ 0.5) 
 +  #print prop_cumsum.searchsorted(0.5) 
 +   
 +  df = boys[boys.year == 1900] 
 +  in1900 = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() 
 +  #again ERROR! 
 +  #print in1900.searchsorted(0.5) + 1 
 +   
 +  #p43 
 +  #def get_quantile_count(group,​ q=0.5): 
 +  #    group = group.sort_index(by='​prop',​ ascending=False) 
 +  #    return group.prop.cumsum().searchsorted(q) + 1 
 +   
 +  #diversity = top1000.groupby(['​year',​ '​sex'​]).apply(get_quantile_count) 
 +  #diversity = diversity.unstack('​sex'​) 
 +   
 +  #​diversity.head() 
 +   
 +  ##p44 
 +  get_last_letter = lambda x: x[-1] 
 +  last_letters = names.name.map(get_last_letter) 
 +  last_letters.name = '​last_letter'​ 
 +   
 +  table = names.pivot_table('​births',​ rows=last_letters,​ cols=['​sex',​ '​year'​],​ aggfunc=sum) 
 +   
 +  #print last_letters.head(5) 
 +   
 +  #print last_letters.tail(5) 
 +   
 +  ##p45 
 +  subtable = table.reindex(columns=[1910,​ 1960, 2010], level='​year'​) 
 +   
 +  #print subtable.head() 
 +   
 +  #print subtable.sum() 
 +   
 +  letter_prop = subtable / subtable.sum().astype(float) 
 +   
 +  fig, axes = plt.subplots(2,​ 1, figsize=(10,​ 8)) 
 +  letter_prop['​M'​].plot(kind='​bar',​ rot=0, ax=axes[0], title='​Male'​) 
 +  letter_prop['​F'​].plot(kind='​bar',​ rot=0, ax=axes[1], title='​Female',​ legend=False) 
 +   
 +  #​plt.show() 
 +   
 +  letter_prop = table / table.sum().astype(float) 
 +  dny_ts = letter_prop.ix[['​d',​ '​n',​ '​y'​],​ '​M'​].T 
 +   
 +  #print dny_ts.head() 
 +   
 +  dny_ts.plot(style={'​d':'​-.',​ '​n':'​-',​ '​y':':'​}) 
 +  #​plt.show() 
 +   
 +  ##p47 
 +  all_names = top1000.name.unique() 
 +  mask = np.array(['​lesl'​ in x.lower() for x in all_names]) 
 +   
 +  lesley_like = all_names[mask] 
 +   
 +  #print lesley_like 
 +   
 +  filtered = top1000[top1000.name.isin(lesley_like)] 
 +  #print filtered.groupby('​name'​).births.sum() 
 +   
 +  ##p48 
 +  #print filtered.head(10) 
 +  #print filtered.tail(10) 
 +   
 +  ##p49 
 +  table = filtered.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc='​sum'​) 
 +  #print table.head(5) 
 +  #print table.tail(5) 
 +   
 +  table = table.div(table.sum(1),​ axis=0) 
 +   
 +  #print table.tail() 
 +   
 +  table.plot(style={'​M':​ '​k-',​ '​F':​ '​k--'​}) 
 +  #plt.show()
  
-unames = ['​user_id',​ '​gender',​ '​age',​ '​occupation',​ '​zip'​] 
-users = pd.read_table(inpath+"​users.dat",​ sep='::',​ header=None,​ names=unames) 
  
-rnames = ['​user_id',​ '​movie_id',​ '​rating',​ '​timestamp'​] 
-ratings = pd.read_table(inpath+'​ratings.dat',​ sep='::',​ header=None,​ names=rnames) 
- 
-mnames = ['​movie_id',​ '​title',​ '​genres'​] 
-movies = pd.read_table(inpath+'​movies.dat',​ sep='::',​ header=None,​ names=mnames) 
- 
-#print users[:5] 
- 
-#print ratings[:5] 
- 
-#p29 
-#print movies[:5] 
- 
-#print ratings 
- 
-data = pd.merge(pd.merge(ratings,​ users), movies) 
-#print data 
- 
-#p30 
-mean_ratings = data.pivot_table('​rating',​ rows='​title',​ cols='​gender',​ aggfunc='​mean'​) 
-#print mean_ratings[:​5] 
- 
-#p31 
-ratings_by_title = data.groupby('​title'​).size() 
-#print ratings_by_title[:​10] 
- 
-active_titles = ratings_by_title.index[ratings_by_title >= 250] 
-#print active_titles 
- 
-mean_ratings = mean_ratings.ix[active_titles] 
-#print mean_ratings 
- 
-#p32 
-top_female_ratings = mean_ratings.sort_index(by='​F',​ ascending=False) 
-#print top_female_ratings[:​10] 
- 
-mean_ratings['​diff'​] = mean_ratings['​M'​] - mean_ratings['​F'​] 
-sorted_by_diff = mean_ratings.sort_index(by='​diff'​) 
-#print sorted_by_diff[:​15] 
- 
-#print sorted_by_diff[::​-1][:​15] 
- 
-rating_std_by_title = data.groupby('​title'​)['​rating'​].std() 
-rating_std_by_title = rating_std_by_title.ix[active_titles] 
-print rating_std_by_title.order(ascending=False)[:​10] 
chapter2.1398256349.txt.gz · Last modified: 2014-04-23 12:32 by so_nakagawa