一个完整的大作业

爬取的网站:https://www.douban.com/group/explore/fashion

import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import re
import sys
import jieba
import matplotlib.pyplot as plt

# Translation table for str.translate(): maps every code point above U+FFFF
# (i.e. outside the Basic Multilingual Plane) to U+FFFD, the replacement
# character. Used to work around the "'UCS-2' codec can't encode characters"
# error described at the end of this post.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)



# Listing page whose '.channel-item' entries are scraped below.
wangzhan = 'https://www.douban.com/group/explore/fashion'
# requests never times out by default; without an explicit timeout a stalled
# connection would hang the script forever.
res = requests.get(wangzhan, timeout=10)
# Decode the body as UTF-8 instead of relying on requests' charset guess.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

def getdetail(url):
    """Download one topic page and return the text of its post body.

    Args:
        url: Absolute URL of the topic page.

    Returns:
        The text of the second element matching the ``.topic-content``
        selector, or an empty string when the page contains fewer than two
        such elements.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # Explicit timeout: requests waits indefinitely by default.
    resd = requests.get(url, timeout=10)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    contents = soupd.select('.topic-content')
    # The post text is taken from the second match.
    # NOTE(review): presumably the first match is an outer wrapper -- verify
    # against the current page markup.
    if len(contents) < 2:
        # Page without the expected layout (removed topic, error page, ...):
        # return nothing rather than abort the whole scrape with IndexError.
        return ''
    return contents[1].text




# Append the body text of every listed topic to baocun.txt, one topic per
# CRLF-terminated record. The file is opened once for the whole loop and the
# `with` block guarantees it is closed even if a download or write fails.
# newline='' disables newline translation so the explicit '\r\n' is written
# unchanged.
with open('baocun.txt', 'a', encoding='UTF-8', newline='') as fo:
    for news in soup.select('.channel-item'):
        # The first <a> inside an item carries both the title and the link.
        link = news.select('a')[0]
        title = link.text
        url = link['href']
        detail = getdetail(url)
        # Replace characters outside the BMP (e.g. emoji) with U+FFFD.
        det = detail.translate(non_bmp_map)
        fo.write(det + '\r\n')
# Load everything the scraping loop has saved; `with` closes the handle.
with open('baocun.txt', "r", encoding='utf-8') as fo:
    txt = fo.read()

# Words to drop from the ranking (none configured).
ex = {}

# Segment the text into words with jieba.
words = jieba.lcut(txt)
# Plain copy of the token list, kept under its original name.
ls = list(words)

# Count occurrences, ignoring single-character tokens.
counts = {}
for word in words:
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# pop() with a default tolerates excluded words that never occurred.
for word in ex:
    counts.pop(word, None)

# Rank by frequency, highest first, and print at most the top 20. Slicing
# avoids an IndexError when fewer than 20 distinct words were found.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
for word, count in items[:20]:
    print("{:<10}{:>5}".format(word, count))

# Render a word cloud from the whole segmented corpus. WordCloud.generate()
# takes running text, so the jieba tokens are joined with spaces; passing the
# leftover loop variable `word` would draw a cloud of a single word.
# NOTE(review): the default WordCloud font may lack CJK glyphs -- if the
# image shows empty boxes, pass font_path=<a Chinese-capable font>; confirm
# on the target machine.
mywc = WordCloud().generate(" ".join(words))
plt.imshow(mywc)
plt.axis("off")
plt.show()

  

遇到的问题:文本中含有超出基本多文种平面(BMP)的字符(例如 emoji)时,程序报错:'UCS-2' codec can't encode characters in position 175-175

解决办法:

import sys
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
det = detail.translate(non_bmp_map)

  词云图:

结果分析:从词频统计和词云图可以看出,该小组中女性用户所占的比例很高,讨论的内容大多与保养有关。

posted on 2017-10-30 18:48  33陈思远  阅读(333)  评论(0)    收藏  举报