本文共 8296 字,大约阅读时间需要 27 分钟。
https://grouplens.org/datasets/movielens
u.user用户属性文件
包含user.id用户ID age年龄 gender性别 occupation职业 ZIP code邮编等属性,每个属性之间用|分割
u.item电影元数据
包含movie.id电影ID title电影标题 release date电影上映日期 IMDB link 电影分类向量等属性,每个属性之间用|分割
u.data用户对电影的评级
包含user.id用户ID movie.id电影ID rating评分(从1-5) timestamp时间戳等属性,每个属性之间用制表符\t分割
使用jupyter notebook进入编辑器
# Explore the MovieLens 100k user file (u.user) with PySpark: count the
# distinct values in each column and plot the distribution of user ages.
from pyspark import SparkContext
import matplotlib.pyplot

# Run Spark locally (master "local") under the application name "movielens".
sc = SparkContext("local", "movielens")

# Load the user file from the local filesystem (note the file:// prefix).
# It can instead be uploaded to HDFS and read from there, e.g.
# hdfs://192.168.1.101:9000/ml-100k/u.user
user_data = sc.textFile("file:///home/chenjie/ml-100k/u.user")

# Show the first record. Each record is: user id|age|gender|occupation|ZIP code
print(user_data.first())
# u'1|24|M|technician|85711'

# Split every line on "|" so the columns can be addressed by index.
user_fields = user_data.map(lambda line: line.split("|"))

# Number of users: one user id per record, so simply count the records.
num_users = user_fields.map(lambda fields: fields[0]).count()
# For the remaining columns, de-duplicate before counting.
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d"
      % (num_users, num_genders, num_occupations, num_zipcodes))
# Users: 943, genders: 2, occupations: 21, ZIP codes: 795

# Pull the age column back to the driver as a list of ints.
ages = user_fields.map(lambda x: int(x[1])).collect()

# Histogram of user ages. `density=True` normalises the histogram so that its
# area sums to 1; it replaces the `normed` keyword, which newer matplotlib
# releases no longer accept.
#
# Quick reference for matplotlib.pyplot.hist parameters:
#   x           : (n,) array or sequence of (n,) arrays (one or several datasets)
#   bins        : int, sequence or 'auto' - an int gives the number of bins,
#                 a sequence gives the bin edges
#   range       : tuple or None - values outside the range are ignored
#   density     : bool - normalise to a probability density
#   weights     : (n,) array_like or None
#   cumulative  : bool
#   bottom      : array_like, scalar, or None
#   histtype    : {'bar', 'barstacked', 'step', 'stepfilled'} (default 'bar')
#   align       : {'left', 'mid', 'right'} - bar alignment
#   orientation : {'horizontal', 'vertical'} - bar direction
#   log         : bool - logarithmic axis
#   color       : color or array_like of colors or None
#   label       : string or None
#   stacked     : bool
# Returns:
#   n       : array or list of arrays (the bin values)
#   bins    : array (the bin edges)
#   patches : list or list of lists
matplotlib.pyplot.hist(ages, bins=20, color='lightblue', density=True)

# Grab the current figure and enlarge it. set_size_inches(w, h, forward=...)
# sets the figure size in inches (1 inch = 2.54 cm); forward=True also
# updates the canvas size automatically.
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
matplotlib.pyplot.show()
下面进行用户职业分布图,可以使用map+reduce,也可以使用countByValue函数
# Plot the number of users per occupation, then compute the same counts in
# two ways: map + reduceByKey, and countByValue.
import numpy as np

# Map-reduce approach: emit (occupation, 1) for every user and sum per key.
count_by_occupation = (
    user_fields
    .map(lambda fields: (fields[3], 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

# Convert the (occupation, count) pairs to numpy arrays and order both arrays
# by ascending count. The sort order is computed once and applied to both.
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])
order = np.argsort(y_axis1)
x_axis1 = x_axis1[order]
y_axis1 = y_axis1[order]

# np.arange(5) returns array([0, 1, 2, 3, 4]): one bar position per occupation.
pos = np.arange(len(x_axis1))
width = 1.0

# set_xticks tells matplotlib where to place the ticks; set_xticklabels then
# replaces the default numeric labels with the occupation names.
ax = matplotlib.pyplot.axes()
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis1)

# align='edge' makes each bar start at `pos`, so the tick at pos + width / 2
# sits under the middle of the bar regardless of matplotlib's default alignment.
matplotlib.pyplot.bar(pos, y_axis1, width, color='green', align='edge')
matplotlib.pyplot.xticks(rotation=30)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
matplotlib.pyplot.show()

# countByValue approach: count the occurrences of each occupation directly.
# It is applied to the occupation itself rather than to (occupation, 1)
# pairs, so the keys are plain occupation names, as in the map-reduce result.
count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()

print("Map-reduce approach:")
print(dict(count_by_occupation))
print("")
print("countByValue approach:")
print(dict(count_by_occupation2))
print("")
# Both approaches yield the same occupation -> count mapping, e.g.:
# {u'administrator': 79, u'executive': 32, u'retired': 14, u'doctor': 7,
#  u'entertainment': 18, u'marketing': 26, u'writer': 45, u'none': 9,
#  u'healthcare': 16, u'scientist': 31, u'homemaker': 7, u'student': 196,
#  u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66,
#  u'artist': 28, u'salesman': 12, u'other': 105, u'lawyer': 12,
#  u'engineer': 67}
下面进行电影年龄分析
注意到电影数据中有些数据不规整,需要进行解析处理,如缺失年份的情况下将其设置为1900,然后后续处理中过滤掉这些数据
# Plot the distribution of movie ages (2017 minus release year) from u.item.
movie_data = sc.textFile("file:///home/chenjie/ml-100k/u.item")
print(movie_data.first())
# 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
num_movies = movie_data.count()
print("Movie: %d" % num_movies)
# Movie: 1682


def conver_year(x):
    """Return the year given by the last four characters of *x*.

    Returns the sentinel 1900 when those characters are not a valid integer
    (for example when the release date is missing), so that such records can
    be filtered out afterwards.
    """
    try:
        return int(x[-4:])
    except (TypeError, ValueError):
        # Only parsing failures map to the sentinel; other errors propagate.
        return 1900


movie_fields = movie_data.map(lambda lines: lines.split("|"))
# Column 2 is the release date, e.g. "01-Jan-1995".
years = movie_fields.map(lambda fields: fields[2]).map(lambda x: conver_year(x))
# Drop the records whose year could not be parsed.
years_filtered = years.filter(lambda x: x != 1900)

# Mapping of movie age -> number of movies with that age.
movie_ages = years_filtered.map(lambda yr: 2017 - yr).countByValue()

# Histogram the ages themselves, weighting each age by its movie count.
# Bin edges must be increasing, so the ages are sorted and one closing edge
# is appended so that the oldest age gets its own bin.
ages_sorted = sorted(movie_ages)
values = [movie_ages[age] for age in ages_sorted]
bins = ages_sorted + [ages_sorted[-1] + 1]
matplotlib.pyplot.hist(ages_sorted, bins=bins, weights=values,
                       color='green', density=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
matplotlib.pyplot.show()
电影的年龄分布
自己实现统计功能或者使用stats函数
# Summary statistics of the ratings in u.data, computed by hand and with
# Spark's built-in stats(), followed by a bar chart of the rating distribution.
import numpy as np
import matplotlib.pyplot as plt

rating_data = sc.textFile("file:///home/chenjie/ml-100k/u.data")
print(rating_data.first())
num_ratings = rating_data.count()
print("评分:%d条" % num_ratings)
# Prints the first record (196, 242, 3, 881250949 separated by tabs) and
# then a rating count of 100000.

# Records are tab-separated: user id, movie id, rating, timestamp.
rating_data_fields = rating_data.map(lambda line: line.split("\t"))
ratings = rating_data_fields.map(lambda fields: int(fields[2]))

max_rating = ratings.reduce(max)
min_rating = ratings.reduce(min)
# Divide by a float so the mean is not truncated by integer division.
mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
median_rating = np.median(ratings.collect())

# Reload the user and movie files so this cell can run on its own.
user_data = sc.textFile("file:///home/chenjie/ml-100k/u.user")
user_fields = user_data.map(lambda line: line.split("|"))
num_users = user_fields.map(lambda fields: fields[0]).count()
ratings_per_user = num_ratings / float(num_users)

movie_data = sc.textFile("file:///home/chenjie/ml-100k/u.item")
num_movies = movie_data.count()
ratings_per_movie = num_ratings / float(num_movies)

print(max_rating)
print(min_rating)
print(mean_rating)
print(median_rating)
print(ratings_per_user)
print(ratings_per_movie)

# Spark's built-in summary statistics for the same RDD.
print(ratings.stats())

# Rating distribution: share of all ratings that each rating value received.
# Sort by rating value so the bars appear in a meaningful, stable order.
count_by_rating = ratings.countByValue()
rating_values = sorted(count_by_rating)
x_axis = np.array(rating_values)
y_axis = np.array([float(count_by_rating[r]) for r in rating_values])
y_axis_normed = y_axis / y_axis.sum()

pos = np.arange(len(x_axis))
width = 1.0
plt.bar(pos, y_axis_normed, width, color='green', align='center')
# Label every bar with the rating value it represents.
plt.xticks(pos, x_axis, rotation=30)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()
电影评级分布
# Histogram of how many ratings each user has given.
import matplotlib.pyplot as plt

rating_data = sc.textFile("file:///home/chenjie/ml-100k/u.data")
print(rating_data.first())
rating_data_fields = rating_data.map(lambda line: line.split("\t"))
print(rating_data_fields.first())

# Group the ratings by user id: (user id, iterable of that user's ratings).
user_ratings_grouped = rating_data_fields.map(
    lambda fields: (int(fields[0]), int(fields[2]))
).groupByKey()

# (user id, number of ratings). mapValues avoids tuple-unpacking lambda
# parameters, which Python 3 does not support.
user_ratings_buuser = user_ratings_grouped.mapValues(len)
print(user_ratings_buuser.take(5))

# Keep only the per-user counts and bring them back to the driver.
user_ratings_buuser_local = user_ratings_buuser.values().collect()

# density=True normalises the histogram; it replaces the `normed` keyword,
# which newer matplotlib releases no longer accept.
plt.hist(user_ratings_buuser_local, bins=200, color='green', density=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()
各用户的电影评级分布图