一个完整的大作业

1.选一个自己感兴趣的主题。

2.网络上爬取相关的数据。

3.进行文本分析,生成词云。

4.对文本分析结果解释说明。

5.写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。

import requests
from bs4 import BeautifulSoup
import re
import pandas

def getonecomment(username,js_text):
    """Extract one user's review text and timestamp from a comment payload.

    Args:
        username: Nickname exactly as it appears in ``js_text``.
        js_text: Raw response text returned by the comment endpoint.

    Returns:
        A dict that always contains ``'username'``. ``'comment'`` is added
        when an entry for the nickname is found, and ``'time'`` when that
        entry also carries a creation time. A found comment is additionally
        appended to ``jd.txt`` (UTF-8, one comment per line).
    """
    comment = {'username': username}

    # Escape the nickname and match it as a complete quoted value; the
    # original wrapped it in a character class, which matched any single
    # character of the name and broke on names containing ']' or '^'.
    anchor = '"' + re.escape(username) + '","userClient.*?'

    content = re.search(anchor + 'content":"(.*?)","creationTime', js_text)
    if content is None:
        return comment
    comment['comment'] = content.group(1)

    created = re.search(anchor + 'creationTime":"(.*?)","isTop', js_text)
    if created is not None:
        comment['time'] = created.group(1)

    # Written as UTF-8 because the analysis step reads jd.txt as UTF-8.
    # The newline keeps adjacent comments from fusing into one token.
    with open('jd.txt', 'a', encoding='utf-8') as f0:
        f0.write(comment['comment'] + '\n')
    return comment

def getpagecomments(js_text):
    """Build a comment record for every nickname found in one page payload.

    Args:
        js_text: Raw response text of a single comment page.

    Returns:
        A list with one ``getonecomment`` result per nickname occurrence,
        in the order the nicknames appear in ``js_text``.
    """
    nicknames = re.findall('false,"nickname":"(.*?)","userClient',js_text)
    return [getonecomment(name, js_text) for name in nicknames]

def getcomments(url, pages=30):
    """Download and parse the review pages for one product.

    Args:
        url: Product page URL whose last path segment is ``<id>.html``.
        pages: Number of comment pages to request (10 comments per page).

    Returns:
        A flat list of the comment dicts produced by ``getpagecomments``
        for every requested page.

    Raises:
        ValueError: If no ``<id>.html`` segment can be found in ``url``.
    """
    # The dot before "html" is escaped so it only matches a literal '.'.
    match = re.search(r'.*/(.*)\.html', url)
    if match is None:
        raise ValueError('cannot find a product id in URL: {!r}'.format(url))
    url_id = match.group(1)

    commentsls = []
    for i in range(pages):
        # A timeout keeps one stalled connection from hanging the whole scrape.
        js_text = requests.get("https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4635&productId={}&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1".format(url_id,i), timeout=10).text
        commentsls.extend(getpagecomments(js_text))
    return commentsls
    
    


# Search-results page to scrape; the keyword is URL-encoded in the query string.
url_main='https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA&enc=utf-8&suggest=1.rem.0.undefined&wq=%E5%8D%8E%E4%B8%BA&pvid=a30781dea7a8409aba07c6c86bb320ad'
res = requests.get(url_main)
res.encoding = 'UTF-8'

soup = BeautifulSoup(res.text,'html.parser')

# Only the first <li> that contains a '.gl-i-wrap' element is scraped;
# its first link is taken as the product page.
commentstotal = []
for li_tag in soup.select('li'):
    if not li_tag.select('.gl-i-wrap'):
        continue
    url_page = "https:" + li_tag.select('a')[0]['href']
    commentstotal.extend(getcomments(url_page))
    break

# Persist the collected comment records as a spreadsheet.
df = pandas.DataFrame(commentstotal)
df.to_excel('jd.xlsx')
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from collections import Counter

# Load the review text that the scraping step appended to jd.txt.
with open('jd.txt','r',encoding='utf-8') as f1:
    jd = f1.read()

words = list(jieba.cut(jd))

# Tokens excluded from the frequency count: filler words plus leftover
# punctuation and markup fragments.
ul={'那个', '', '整个', '','','', '我们','','','','', '',\
    '怎么', '能够','','他们', '你们','知道', '什么','', '一个','','',\
    '','没有','已经','就是','可以','','','这个','','','', '说道',\
    '','', '','','','','','','','','','', '','',\
    '','','','', '','','','','','','','','','',''\
    ,'','','','',' ','\u3000','','\n','/','"',"'",',',':','.','=','>',\
    '<','div','class','\\','n',''}

# Count every remaining token in a single pass instead of calling
# list.count() once per distinct word.
dic = dict(Counter(w for w in words if w not in ul))

c = sorted(dic.items(), key=lambda x:x[1], reverse=True)

# Slicing tolerates a corpus with fewer than 20 distinct words.
top_words = c[:20]
for entry in top_words:
    print(entry)

# Each word is repeated as many times as it occurred, so the text handed to
# WordCloud carries the same relative frequencies as the counts above.
cy_file = ''.join((word + ' ') * count for word, count in top_words)
with open('词云.txt','w',encoding='utf-8') as f1:
    f1.write(cy_file)

if top_words:
    # NOTE(review): WordCloud's default font may lack CJK glyphs; pass
    # font_path=<a Chinese-capable TTF> if the image shows empty boxes.
    cy = WordCloud().generate(cy_file)
    plt.imshow(cy)
    plt.axis("off")
    plt.show()
else:
    print('no words left after filtering; skipping word cloud')

 

 

 

posted @ 2017-11-02 20:23  12-张振勋  阅读(222)  评论(0)  编辑  收藏  举报