User Tools

Site Tools


chapter2

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
chapter2 [2014-04-23 12:40]
so_nakagawa [2.2.py]
chapter2 [2014-04-23 12:51] (current)
so_nakagawa [2.3.py]
Line 183: Line 183:
  
 ===== 2.3.py ==== ===== 2.3.py ====
-#​!/​usr/​bin/​python +  ​#​!/​usr/​bin/​python 
-import pandas as pd +  import pandas as pd 
-import numpy as np +  import numpy as np 
-import matplotlib.pyplot as plt +  import matplotlib.pyplot as plt 
- +   
-#p34 +  ##p34 
-inpath = "/​Users/​so/​Dropbox/​bmi/​reading_circle/​pydata-book-master/​ch02/"​ +  inpath = "/​Users/​so/​Dropbox/​bmi/​reading_circle/​pydata-book-master/​ch02/"​ 
- +   
-#? unclear +  #? unclear 
-#names = inpath + "​ch02/​names/"​ +  #names = inpath + "​ch02/​names/"​ 
-#print names.head(10) +  #print names.head(10) 
- +   
-#p35 +  ##p35 
-names1880 = pd.read_csv(inpath+'​names/​yob1880.txt',​ names=['​name',​ '​sex',​ '​births'​]) +  names1880 = pd.read_csv(inpath+'​names/​yob1880.txt',​ names=['​name',​ '​sex',​ '​births'​]) 
- +   
-#print names1880 +  #print names1880 
- +   
-#p36 +  ##p36 
- +   
-#print names1880.groupby('​sex'​).births.sum() +  #print names1880.groupby('​sex'​).births.sum() 
- +   
-years = range(1880, 2011) +  years = range(1880, 2011) 
- +   
-pieces = [] +  pieces = [] 
-columns = ['​name',​ '​sex',​ '​births'​] +  columns = ['​name',​ '​sex',​ '​births'​] 
- +   
-for year in years:+  for year in years:
     path = inpath+'​names/​yob%d.txt'​ % year     path = inpath+'​names/​yob%d.txt'​ % year
     frame = pd.read_csv(path,​ names=columns)     frame = pd.read_csv(path,​ names=columns)
Line 215: Line 215:
     frame['​year'​] = year     frame['​year'​] = year
     pieces.append(frame)     pieces.append(frame)
-names = pd.concat(pieces,​ ignore_index=True) +  ​names = pd.concat(pieces,​ ignore_index=True) 
- +   
-#p37 +  #p37 
- +   
-#print names +  #print names 
- +   
-total_births = names.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc=sum) +  total_births = names.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc=sum) 
- +   
-#print total_births.tail() +  #print total_births.tail() 
- +   
-def add_prop(group):​+  def add_prop(group):​
     births = group.births.astype(float)     births = group.births.astype(float)
 +  ​
     group['​prop'​] = births / births.sum()     group['​prop'​] = births / births.sum()
     ​     ​
-    #p38+    ​##p38
     return group     return group
-names = names.groupby(['​year',​ '​sex'​]).apply(add_prop) +  ​names = names.groupby(['​year',​ '​sex'​]).apply(add_prop) 
- +   
-#print names +  #print names 
- +   
-#p39 +  ##p39 
-#print np.allclose(names.groupby(['​year',​ '​sex'​]).prop.sum(),​ 1) +  #print np.allclose(names.groupby(['​year',​ '​sex'​]).prop.sum(),​ 1) 
- +   
-def get_top1000(group):​+  def get_top1000(group):​
     return group.sort_index(by='​births',​ ascending=False)[:​1000]     return group.sort_index(by='​births',​ ascending=False)[:​1000]
-grouped = names.groupby(['​year',​ '​sex'​]) +  ​grouped = names.groupby(['​year',​ '​sex'​]) 
-top1000 = grouped.apply(get_top1000) +  top1000 = grouped.apply(get_top1000) 
- +   
-#print top1000 +  #print top1000 
- +   
-pieces = [] +  pieces = [] 
-for year, group in names.groupby(['​year',​ '​sex'​]):​+  for year, group in names.groupby(['​year',​ '​sex'​]):​
     pieces.append(group.sort_index(by='​births',​ ascending=False)[:​1000])     pieces.append(group.sort_index(by='​births',​ ascending=False)[:​1000])
-top1000 = pd.concat(pieces,​ ignore_index=True) +  ​top1000 = pd.concat(pieces,​ ignore_index=True) 
- +   
-#print top1000 +  #print top1000 
- +   
-boys = top1000[top1000.sex == '​M'​] +  boys = top1000[top1000.sex == '​M'​] 
-girls = top1000[top1000.sex == '​F'​] +  girls = top1000[top1000.sex == '​F'​] 
- +   
-#p40 +  ##p40 
-total_births = top1000.pivot_table('​births',​ rows='​year',​ cols='​name',​ aggfunc=sum) +  total_births = top1000.pivot_table('​births',​ rows='​year',​ cols='​name',​ aggfunc=sum) 
- +   
-#print total_births +  #print total_births 
- +   
-subset = total_births[['​John',​ '​Harry',​ '​Mary',​ '​Marilyn'​]] +  subset = total_births[['​John',​ '​Harry',​ '​Mary',​ '​Marilyn'​]] 
- +   
-subset.plot(subplots=True,​ figsize=(12,​ 10), grid=False, title="​Number of births per year"​) +  subset.plot(subplots=True,​ figsize=(12,​ 10), grid=False, title="​Number of births per year"​) 
-#​plt.show() +  #​plt.show() 
- +   
-#p41 +  ##p41 
-table = top1000.pivot_table('​prop',​ rows='​year',​ cols='​sex',​ aggfunc=sum) +  table = top1000.pivot_table('​prop',​ rows='​year',​ cols='​sex',​ aggfunc=sum) 
-table.plot(title='​Sum of table1000.prop by year and sex', yticks=np.linspace(0,​ 1.2, 13), xticks=range(1880,​ 2020, 10)) +  table.plot(title='​Sum of table1000.prop by year and sex', yticks=np.linspace(0,​ 1.2, 13), xticks=range(1880,​ 2020, 10)) 
-#​plt.show() +  #​plt.show() 
- +   
-df = boys[boys.year == 2010] +  df = boys[boys.year == 2010] 
-#print df +  #print df 
- +   
-#p42 +  #p42 
-prop_cumsum = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() +  prop_cumsum = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() 
- +   
-#print prop_cumsum[:​10] +  #print prop_cumsum[:​10] 
- +   
-#ERROR, object type is different +  #ERROR, object type is different 
-#print np.searchsorted(prop_cumsum,​ 0.5) +  #print np.searchsorted(prop_cumsum,​ 0.5) 
-#print prop_cumsum.searchsorted(0.5) +  #print prop_cumsum.searchsorted(0.5) 
- +   
-df = boys[boys.year == 1900] +  df = boys[boys.year == 1900] 
-in1900 = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() +  in1900 = df.sort_index(by='​prop',​ ascending=False).prop.cumsum() 
-#again ERROR! +  #again ERROR! 
-#print in1900.searchsorted(0.5) + 1 +  #print in1900.searchsorted(0.5) + 1 
- +   
-#p43 +  #p43 
-#def get_quantile_count(group,​ q=0.5): +  #def get_quantile_count(group,​ q=0.5): 
-#    group = group.sort_index(by='​prop',​ ascending=False) +  #    group = group.sort_index(by='​prop',​ ascending=False) 
-#    return group.prop.cumsum().searchsorted(q) + 1 +  #    return group.prop.cumsum().searchsorted(q) + 1 
- +   
-#diversity = top1000.groupby(['​year',​ '​sex'​]).apply(get_quantile_count) +  #diversity = top1000.groupby(['​year',​ '​sex'​]).apply(get_quantile_count) 
-#diversity = diversity.unstack('​sex'​) +  #diversity = diversity.unstack('​sex'​) 
- +   
-#​diversity.head() +  #​diversity.head() 
- +   
-#p44 +  ##p44 
-get_last_letter = lambda x: x[-1] +  get_last_letter = lambda x: x[-1] 
-last_letters = names.name.map(get_last_letter) +  last_letters = names.name.map(get_last_letter) 
-last_letters.name = '​last_letter'​ +  last_letters.name = '​last_letter'​ 
- +   
-table = names.pivot_table('​births',​ rows=last_letters,​ cols=['​sex',​ '​year'​],​ aggfunc=sum) +  table = names.pivot_table('​births',​ rows=last_letters,​ cols=['​sex',​ '​year'​],​ aggfunc=sum) 
- +   
-#print last_letters.head(5) +  #print last_letters.head(5) 
- +   
-#print last_letters.tail(5) +  #print last_letters.tail(5) 
- +   
-#p45 +  ##p45 
-subtable = table.reindex(columns=[1910,​ 1960, 2010], level='​year'​) +  subtable = table.reindex(columns=[1910,​ 1960, 2010], level='​year'​) 
- +   
-#print subtable.head() +  #print subtable.head() 
- +   
-#print subtable.sum() +  #print subtable.sum() 
- +   
-letter_prop = subtable / subtable.sum().astype(float) +  letter_prop = subtable / subtable.sum().astype(float) 
- +   
-fig, axes = plt.subplots(2,​ 1, figsize=(10,​ 8)) +  fig, axes = plt.subplots(2,​ 1, figsize=(10,​ 8)) 
-letter_prop['​M'​].plot(kind='​bar',​ rot=0, ax=axes[0], title='​Male'​) +  letter_prop['​M'​].plot(kind='​bar',​ rot=0, ax=axes[0], title='​Male'​) 
-letter_prop['​F'​].plot(kind='​bar',​ rot=0, ax=axes[1], title='​Female',​ legend=False) +  letter_prop['​F'​].plot(kind='​bar',​ rot=0, ax=axes[1], title='​Female',​ legend=False) 
- +   
-#​plt.show() +  #​plt.show() 
- +   
-letter_prop = table / table.sum().astype(float) +  letter_prop = table / table.sum().astype(float) 
-dny_ts = letter_prop.ix[['​d',​ '​n',​ '​y'​],​ '​M'​].T +  dny_ts = letter_prop.ix[['​d',​ '​n',​ '​y'​],​ '​M'​].T 
- +   
-#print dny_ts.head() +  #print dny_ts.head() 
- +   
-dny_ts.plot(style={'​d':'​-.',​ '​n':'​-',​ '​y':':'​}) +  dny_ts.plot(style={'​d':'​-.',​ '​n':'​-',​ '​y':':'​}) 
-#​plt.show() +  #​plt.show() 
- +   
-#p47 +  ##p47 
-all_names = top1000.name.unique() +  all_names = top1000.name.unique() 
-mask = np.array(['​lesl'​ in x.lower() for x in all_names]) +  mask = np.array(['​lesl'​ in x.lower() for x in all_names]) 
- +   
-lesley_like = all_names[mask] +  lesley_like = all_names[mask] 
- +   
-#print lesley_like +  #print lesley_like 
- +   
-filtered = top1000[top1000.name.isin(lesley_like)] +  filtered = top1000[top1000.name.isin(lesley_like)] 
-#print filtered.groupby('​name'​).births.sum() +  #print filtered.groupby('​name'​).births.sum() 
- +   
-#p48 +  ##p48 
-#print filtered.head(10) +  #print filtered.head(10) 
-#print filtered.tail(10) +  #print filtered.tail(10) 
- +   
-#p49 +  ##p49 
-table = filtered.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc='​sum'​) +  table = filtered.pivot_table('​births',​ rows='​year',​ cols='​sex',​ aggfunc='​sum'​) 
-#print table.head(5) +  #print table.head(5) 
-#print table.tail(5) +  #print table.tail(5) 
- +   
-table = table.div(table.sum(1),​ axis=0)+  table = table.div(table.sum(1),​ axis=0)
 +   
 +  #print table.tail() 
 +   
 +  table.plot(style={'​M':​ '​k-',​ '​F':​ '​k--'​}) 
 +  #plt.show()
  
-#print table.tail() 
  
-table.plot(style={'​M':​ '​k-',​ '​F':​ '​k--'​}) 
-#plt.show() 
chapter2.txt · Last modified: 2014-04-23 12:51 by so_nakagawa