User Tools

Site Tools


chapter2

This is an old revision of the document!


Chapter2

2.1.py

#!/usr/bin/python
import json

# Location of the usagov_bitly sample data file, read below one JSON document
# per line.
# NOTE(review): '/Users/so' is substituted directly in front of 'Dropbox', so
# the resulting path is '/Users/soDropbox/...' -- confirm whether a '/'
# separator is missing here.
path = '{}Dropbox/bmi/reading_circle/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'.format('/Users/so')

# Parse every line of the file into a Python object. The context manager
# closes the file handle as soon as parsing is done instead of leaving it
# open until the interpreter happens to collect it.
with open(path) as data_file:
    records = [json.loads(line) for line in data_file]

##p16
#print records[0]
#print records[1]

##p17
#print records[0]['tz']

# Collect the 'tz' value of every record that has that key; records without
# a 'tz' key are skipped.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

##p18

#print time_zones[:10]


def get_counts(sequence):
    """Return a dict mapping each distinct item of *sequence* to its count.

    Keys appear in the order in which the items are first encountered.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

# Tally how often each time zone string occurs, using the plain-dict counter.
counts = get_counts(time_zones)
#print counts

from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item in *sequence* using a defaultdict.

    Returns a ``defaultdict(int)``, so looking up an item that never occurred
    yields 0 (and inserts that key) instead of raising KeyError.
    """
    tally = defaultdict(int)
    for item in sequence:
        # A missing key is created with int() == 0 on first access.
        tally[item] = tally[item] + 1
    return tally

##p19
# Same tally as above, this time via the defaultdict-based counter.
counts2 = get_counts2(time_zones)
#print counts2
#print counts2['America/New_York']
#print len(time_zones)

def top_counts(count_dict, n=10):
    """Return the *n* largest ``(count, key)`` pairs of *count_dict*.

    Args:
        count_dict: mapping from key to its count.
        n: maximum number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples in ascending order, so the pair
        with the highest count comes last. Pairs with equal counts are
        ordered by key. The list is empty when ``n <= 0`` or when
        *count_dict* is empty.
    """
    # A slice of [-0:] selects the whole list and a negative n would select
    # an arbitrary tail, so non-positive n is handled explicitly.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

# (count, tz) pairs for the most frequent time zones; n is left at its
# default of 10.
out_top_counts = top_counts(counts)
#print out_top_counts

from collections import Counter

# Rebinds `counts` (until now the plain dict returned by get_counts) to a
# Counter built from the same time zone list.
counts = Counter(time_zones)
#print counts.most_common(10)

##p20
from pandas import DataFrame, Series
import pandas as pd; import numpy as np

# Build a DataFrame from the list of parsed records.
frame = DataFrame(records)
#print frame

##p21
# Number of occurrences of each distinct value in the 'tz' column.
tz_counts = frame['tz'].value_counts()
#print tz_counts[:10]

##p22
# Replace absent 'tz' values with 'Missing' and empty strings with 'Unknown',
# then recount; this rebinds tz_counts.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

#print tz_counts[:10]

import matplotlib.pyplot as plt

# Horizontal bar chart of the first ten entries of tz_counts. The plt.show()
# call below is commented out.
tz_counts[:10].plot(kind='barh', rot=0)
#plt.show()

#print frame['a'][1]
#print frame['a'][50]
#print frame['a'][51]

##p23
# First whitespace-separated token of every non-null value in column 'a'.
# NOTE(review): column 'a' presumably holds user-agent strings -- confirm
# against the data.
results = Series([x.split()[0] for x in frame.a.dropna()])
#print results[:5]
#print results.value_counts()[:8]

# Keep only the rows whose 'a' value is present.
cframe = frame[frame.a.notnull()]

##p24
# Label each remaining row by whether its 'a' value contains 'Windows'.
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
#Function: where(condition, x, y)

#print operating_system[:5]

# Group rows by (tz, operating_system label), count the rows in each group,
# pivot the label level into columns and fill absent combinations with 0.
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

#print agg_counts[:10]

##p25
# Sum across the columns of each row, then get the positions that would sort
# those row totals.
indexer = agg_counts.sum(1).argsort()

#print indexer[:10]

##p26
# Reorder the rows by those positions and keep the last ten of them.
count_subset = agg_counts.take(indexer)[-10:]

#print count_subset

# Stacked horizontal bar chart of the raw counts.
count_subset.plot(kind='barh', stacked=True)

#plt.show()

# Divide every row by its own total so that each row sums to 1, then plot
# the resulting proportions the same way.
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)

#plt.show()
chapter2.1398256259.txt.gz · Last modified: 2014-04-23 12:30 by so_nakagawa