一个完整的大作业
爬取的网站:https://www.douban.com/group/explore/fashion
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import re
import sys
import jieba
import matplotlib.pyplot as plt
# Translation table that maps every code point outside the Basic Multilingual
# Plane (U+10000 and above) to U+FFFD, the replacement character.  It is
# passed to str.translate() to strip such characters from scraped text.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# Listing page whose '.channel-item' entries are scraped.
wangzhan = 'https://www.douban.com/group/explore/fashion'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early on an HTTP error instead of parsing an
# error page as if it were the listing.
res = requests.get(wangzhan, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
def getdetail(url, timeout=10):
    """Download a page and return the text of its second '.topic-content' element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the element at index 1 among those matching the CSS
        selector '.topic-content'.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page has fewer than two '.topic-content' elements.
    """
    resd = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page.
    resd.raise_for_status()
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    return soupd.select('.topic-content')[1].text
# For every entry on the listing page, follow its first link, fetch the
# detail text and append it to baocun.txt.
for news in soup.select('.channel-item'):
    link = news.select('a')[0]  # first anchor inside the entry
    title = link.text  # not written to the file; useful when debugging
    url = link['href']
    detail = getdetail(url)
    # Replace characters above U+FFFF with U+FFFD before saving.
    det = detail.translate(non_bmp_map)
    # The context manager closes the file even if the write fails.
    with open('baocun.txt', "ab+") as fo:
        fo.write((det + '\r\n').encode('UTF-8'))
# Read back everything that was appended to baocun.txt.
with open('baocun.txt', "r", encoding='utf-8') as fin:
    txt = fin.read()

ex = {}  # words to drop from the frequency table (empty by default)
ls = []  # every token, in order; joined below as the word-cloud input
words = jieba.lcut(txt)
counts = {}
for word in words:
    ls.append(word)
    # Single-character tokens are not counted.
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# pop() with a default tolerates excluded words that never occurred.
for word in ex:
    counts.pop(word, None)

items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slicing copes with texts that contain fewer than 20 distinct words.
for word, count in items[:20]:
    print("{:<10}{:>5}".format(word, count))

# Build the cloud from the whole segmented text, space-separated, rather than
# from a single leftover loop variable.
# NOTE(review): rendering Chinese glyphs usually requires
# WordCloud(font_path=...) pointing at a CJK font - confirm on the target
# machine.
mywc = WordCloud().generate(" ".join(ls))
plt.imshow(mywc)
plt.axis("off")
plt.show()
遇到的问题:'UCS-2' codec can't encode characters in position 175-175
解决办法:把超出基本多文种平面(BMP,即码点大于 0xFFFF)的字符统一替换为 U+FFFD:
import sys
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
det = detail.translate(non_bmp_map)
词云图:

结果分析:可以看出女性在这里占的比例是很高的,讨论的都是有关保养方面的事情。
浙公网安备 33010602011771号