User Tools

Site Tools


chapter2

Table of Contents

Chapter 2

2.1.py

#!/usr/bin/python
import json

# Path to the bit.ly / USA.gov click-log sample shipped with the
# pydata-book repository (ch02). One JSON object per line.
path = '{}Dropbox/bmi/reading_circle/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'.format('/Users/so')

# Parse every line into a dict. Use a context manager so the file handle
# is closed deterministically -- the original `open(path)` inside the
# comprehension leaked the handle.
with open(path) as f:
    records = [json.loads(line) for line in f]

##p16
#print records[0]
#print records[1]

##p17
#print records[0]['tz']

# Time-zone string of every record that has one ('tz' is missing in some).
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

##p18

#print time_zones[:10]

def get_counts(sequence):
    """Return a dict mapping each distinct item in `sequence` to its count."""
    tally = {}
    for item in sequence:
        # dict.get with a default collapses the if/else of the naive version
        tally[item] = tally.get(item, 0) + 1
    return tally

# Frequency of each time zone, built with the plain-dict helper above.
counts = get_counts(time_zones)
#print counts

from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item; defaultdict(int) removes the key test.

    Returns a defaultdict(int), which behaves like a plain dict for lookups
    of keys that were seen.
    """
    freq = defaultdict(int)
    for element in sequence:
        freq[element] += 1
    return freq

##p19
# Same tally, built with the defaultdict-based helper.
counts2 = get_counts2(time_zones)
#print counts2
#print counts2['America/New_York']
#print len(time_zones)

def top_counts(count_dict, n=10):
    """Return the n largest (count, key) pairs, ascending by count.

    Ties are broken by the key, because tuples compare element-wise.
    """
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]

out_top_counts = top_counts(counts)
#print out_top_counts

from collections import Counter

# Counter does the same tally in one call and exposes most_common().
counts = Counter(time_zones)
#print counts.most_common(10)

##p20
from pandas import DataFrame, Series
import pandas as pd; import numpy as np

# Columns come from the union of keys across all JSON records;
# records missing a field get NaN in that column.
frame = DataFrame(records)
#print frame

##p21
tz_counts = frame['tz'].value_counts()
#print tz_counts[:10]

##p22
# Distinguish two kinds of "no time zone": NaN (field absent) -> 'Missing',
# empty string (field present but blank) -> 'Unknown'.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

#print tz_counts[:10]

import matplotlib.pyplot as plt

# Horizontal bar chart of the ten most common time zones.
tz_counts[:10].plot(kind='barh', rot=0)
#plt.show()

#print frame['a'][1]
#print frame['a'][50]
#print frame['a'][51]

##p23
# 'a' is the user-agent string; the first whitespace-separated token is a
# rough browser/client identifier.
results = Series([x.split()[0] for x in frame.a.dropna()])
#print results[:5]
#print results.value_counts()[:8]

cframe = frame[frame.a.notnull()]

##p24
# Label each row 'Windows' / 'Not Windows' by substring match on the agent.
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
#Function: where(condition, x, y) -- elementwise x-if-condition-else-y

#print operating_system[:5]

# Group by (time zone, OS label); unstack to a tz x OS table of row counts.
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

#print agg_counts[:10]

##p25
# Indices that would sort time zones by total count (ascending).
# NOTE(review): book-era pandas idiom; on current pandas, Series.argsort
# semantics differ -- verify before reuse.
indexer = agg_counts.sum(1).argsort()

#print indexer[:10]

##p26
# Reorder rows by total count and keep the ten largest time zones.
count_subset = agg_counts.take(indexer)[-10:]

#print count_subset

count_subset.plot(kind='barh', stacked=True)

#plt.show()

# Normalise each row to sum to 1 so bars show the OS share per time zone.
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)

#plt.show()

2.2.py

#!/usr/bin/python
# MovieLens 1M exercise (book pp. 28-32).
# NOTE(review): written for Python 2 (bare `print` statement, line at the
# bottom) and book-era pandas -- pivot_table(rows=, cols=), sort_index(by=),
# .ix and Series.order were all removed in later pandas releases
# (index=/columns=, sort_values, .loc and sort_values are the replacements).
import pandas as pd

##p28
inpath = "{}Dropbox/bmi/reading_circle/pydata-book-master/ch02/movielens/".format('/Users/so/')

# The three MovieLens files are '::'-delimited with no header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(inpath+"users.dat", sep='::', header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(inpath+'ratings.dat', sep='::', header=None, names=rnames)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(inpath+'movies.dat', sep='::', header=None, names=mnames)

#print users[:5]

#print ratings[:5]

##p29
#print movies[:5]

#print ratings

# Join ratings -> users (on user_id) -> movies (on movie_id); merge infers
# the key columns from the shared column names.
data = pd.merge(pd.merge(ratings, users), movies)
#print data

##p30
# Mean rating per title, split into one column per gender.
mean_ratings = data.pivot_table('rating', rows='title', cols='gender', aggfunc='mean')
#print mean_ratings[:5]

##p31
ratings_by_title = data.groupby('title').size()
#print ratings_by_title[:10]

# Keep only titles with at least 250 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 250]
#print active_titles

mean_ratings = mean_ratings.ix[active_titles]
#print mean_ratings

##p32
# Titles most liked by female raters.
top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)
#print top_female_ratings[:10]

# Positive diff = rated higher by men; negative = higher by women.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_index(by='diff')
#print sorted_by_diff[:15]

#print sorted_by_diff[::-1][:15]

# Titles with the most disagreement among raters (highest rating std-dev).
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.ix[active_titles]
print rating_std_by_title.order(ascending=False)[:10]

2.3.py

#!/usr/bin/python
# US baby-names exercise (book pp. 34 onward). Book-era pandas API
# (pivot_table rows=/cols=) and Python 2 conventions throughout.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##p34
inpath = "/Users/so/Dropbox/bmi/reading_circle/pydata-book-master/ch02/"

#? unclear -- leftover attempt; `names` here would be a string, not a frame
#names = inpath + "ch02/names/"
#print names.head(10)

##p35
# Each yobYYYY.txt file is headerless CSV: name,sex,births.
names1880 = pd.read_csv(inpath+'names/yob1880.txt', names=['name', 'sex', 'births'])

#print names1880

##p36

#print names1880.groupby('sex').births.sum()

years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

# Load every year's file, tag the rows with the year, then concatenate
# into one frame (ignore_index drops the per-file row numbers).
for year in years:
  path = inpath+'names/yob%d.txt' % year
  frame = pd.read_csv(path, names=columns)
  
  frame['year'] = year
  pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)

#p37

#print names

# Total births per year, one column per sex.
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)

#print total_births.tail()

def add_prop(group):
    """Add a 'prop' column to `group`: each row's share of total births.

    Mutates `group` in place and returns it (groupby.apply uses the
    return value). Cast to float guards against integer division.
    """
    totals = group.births.astype(float)
    group['prop'] = totals / totals.sum()

    ##p38
    return group
# Compute 'prop' within each (year, sex) group so shares sum to 1 per group.
names = names.groupby(['year', 'sex']).apply(add_prop)

#print names

##p39
# Sanity check: props sum to ~1.0 inside every (year, sex) group.
#print np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

def get_top1000(group):
    """Return the (up to) 1000 rows of `group` with the most births.

    Rows are ordered by 'births' descending. Uses sort_values: the
    original's DataFrame.sort_index(by=...) was deprecated and later
    removed from pandas; sort_values(by=...) is the drop-in equivalent.
    """
    return group.sort_values(by='births', ascending=False)[:1000]
# Top-1000 names per (year, sex), via groupby + apply...
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

#print top1000

# ...and the same result built manually with an explicit loop.
pieces = []
for year, group in names.groupby(['year', 'sex']):
  pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)

#print top1000

boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

##p40
# Births per year for every top-1000 name (years x names table).
total_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=sum)

#print total_births

subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
#plt.show()

##p41
# Share of all births covered by the top 1000 names, per year and sex --
# a falling curve means increasing name diversity.
table = top1000.pivot_table('prop', rows='year', cols='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
#plt.show()

df = boys[boys.year == 2010]
#print df

#p42
# Cumulative share of births, most popular name first.
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()

#print prop_cumsum[:10]

#ERROR, object type is different -- searchsorted on this pandas version
#rejects the float scalar; book text expects the number of names needed
#to reach 50% of births.
#print np.searchsorted(prop_cumsum, 0.5)
#print prop_cumsum.searchsorted(0.5)

df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
#again ERROR!
#print in1900.searchsorted(0.5) + 1

#p43
#def get_quantile_count(group, q=0.5):
#    group = group.sort_index(by='prop', ascending=False)
#    return group.prop.cumsum().searchsorted(q) + 1

#diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
#diversity = diversity.unstack('sex')

#diversity.head()

##p44
# Last-letter analysis: extract the final letter of every name.
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births by last letter (rows) vs (sex, year) column hierarchy.
table = names.pivot_table('births', rows=last_letters, cols=['sex', 'year'], aggfunc=sum)

#print last_letters.head(5)

#print last_letters.tail(5)

##p45
# Keep three representative years from the 'year' level of the columns.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

#print subtable.head()

#print subtable.sum()

# Normalise each column so letters show as proportions of that year's births.
letter_prop = subtable / subtable.sum().astype(float)

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

#plt.show()

# Full time series for boys' names ending in d / n / y.
letter_prop = table / table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T

#print dny_ts.head()

dny_ts.plot(style={'d':'-.', 'n':'-', 'y':':'})
#plt.show()

##p47
# Boy-names-turned-girl-names: all names containing 'lesl' (Leslie, Lesley...).
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])

lesley_like = all_names[mask]

#print lesley_like

filtered = top1000[top1000.name.isin(lesley_like)]
#print filtered.groupby('name').births.sum()

##p48
#print filtered.head(10)
#print filtered.tail(10)

##p49
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc='sum')
#print table.head(5)
#print table.tail(5)

# Row-normalise: each year's M/F shares of Lesley-like births sum to 1.
table = table.div(table.sum(1), axis=0)

#print table.tail()

table.plot(style={'M': 'k-', 'F': 'k--'})
#plt.show()
chapter2.txt · Last modified: 2014-04-23 12:51 by so_nakagawa