#!/usr/bin/env python
# -*- encoding: UTF-8 -*-

# Copyright: Thuswise Ltd
# Author: D. Haynes

"""Plot a histogram of per-movie value spans read from a directory of .bin files.

For every ``*.bin`` file in the directory named by the first command-line
argument, the records are decoded with ``pack.data``, the difference between
the largest and smallest value of the first column is computed, and the
resulting spans are histogrammed.  The plot is saved as ``moviespan.png`` and
then shown on screen.

usage: votespan.py <directory>
"""

import os
import sys
from time import strftime, gmtime, strptime, mktime
import datetime

import numpy
import pylab
import cProfile
import pstats

import pack

# Number of seconds in 52 whole weeks; used to scale the x axis into years.
TICKS_IN_A_YEAR = 52 * 7 * 24 * 60 * 60
# Floor division keeps this an integer on every Python version (the division
# is exact: 31449600 / 12 == 2620800).
TICKS_IN_A_MONTH = TICKS_IN_A_YEAR // 12


def pathGen():
    """Yield the path of every ``.bin`` file in the directory ``sys.argv[1]``.

    Paths are produced in whatever order ``os.listdir`` returns the entries.
    """
    for i in os.listdir(sys.argv[1]):
        if i.endswith(".bin"):
            path = os.path.join(sys.argv[1], i)
            yield path


def filmId(path):
    """Return the integer film id encoded in the file name of *path*.

    The directory and extension are stripped, the first three characters of
    the remaining stem are discarded and the rest is parsed as an integer.
    NOTE(review): presumably the stem looks like ``mv_0001234`` -- confirm
    against the files actually produced for this data set.

    Raises ``ValueError`` if the remainder of the stem is not an integer.
    """
    root = os.path.splitext(os.path.split(path)[1])[0]
    return int(root[3:])


def main():
    """Compute the span of each film's first column and plot the histogram.

    Returns 0 on success, or 2 when no input directory was given on the
    command line.
    """
    # Fail with a readable message instead of an IndexError raised from
    # inside the path generator.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: votespan.py <directory>\n")
        return 2

    d = []
    for path in pathGen():
        film = filmId(path)
        # The context manager guarantees the handle is closed even if
        # decoding or the numpy reductions raise.
        with open(path, 'rb') as f:
            data = numpy.array(list(pack.data(f)))
        # Column-wise extremes; element 0 selects the first column.
        # NOTE(review): the first column is treated as a time in ticks
        # (see the axis scaling below) -- confirm against pack.data.
        dmin = data.min(axis=0)[0]
        dmax = data.max(axis=0)[0]
        d.append(dmax - dmin)
        # Progress indicator: one dot for every film id divisible by 20.
        if film % 20 == 0:
            sys.stderr.write('.')
    sys.stderr.write('\n')

    pylab.xlabel("Span (years)")
    pylab.ylabel("Movies")
    pylab.title("Movie Lifetime for Netflix Data Set")
    n, b, p = pylab.hist(d, bins=144)
    # Label every 36th bin edge, converting ticks to years.
    ticks = b[::36]
    pylab.xticks(ticks, ["%0.1f" % (i / TICKS_IN_A_YEAR) for i in ticks])
    pylab.savefig('moviespan.png', dpi=72)
    pylab.show()
    return 0


# usage: votespan.py <directory>
if __name__ == "__main__":
    # Run under the profiler and report the 12 most expensive calls by
    # cumulative time.
    cProfile.run("main()", "votespan.prf")
    stats = pstats.Stats("votespan.prf")
    stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats(12)