# -*- coding: utf-8 -*-
from os import path
import jieba
from pyspark import SparkContext
from pyspark.sql import SQLContext
from operator import add
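
# Run Spark locally on a single core; "wordCount" is the application name shown in the Spark UI.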
sc = SparkContext("local[1]", "wordCount")
sc.setLogLevel("ERROR")
sqc = SQLContext(sc)
thisDir = path.dirname(__file__)
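
# Segment a line of Chinese text into a list of words with jieba (default exact mode).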
def wordCut(strings):
    return list(jieba.cut(strings.strip()))
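
# Load the input file as an RDD of lines; words.txt is expected next to this script.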
fileName = 'words.txt'
file_in = sc.textFile(path.join(thisDir, fileName))
linesNum = file_in.count()
print('[INFO] number of lines in file %s: %d' % (fileName, linesNum))
charsNum = file_in.map(len).reduce(add)
print('[INFO] number of characters in file %s: %d' % (fileName, charsNum))
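
# Segment every line; flatMap flattens the per-line word lists into a single RDD of words.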
words = file_in.flatMap(wordCut)
termBigger3 = words.filter(lambda word: len(word) > 3)
print('[INFO] number of words longer than 3 characters in file %s: %d' % (fileName, termBigger3.count()))
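
# Classic word count: map each word to (word, 1), then sum the counts per key with reduceByKey.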
wordCount = words.map(lambda w: (w, 1)).reduceByKey(add)
sqc.createDataFrame(wordCount, ['word', 'count']).sort('count', ascending=False).show(20)
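
# Example run (assuming this script is saved as wordCount.py):
#   spark-submit wordCount.py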