Web Crawler Final Project
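The script below fetches a catalog page with requests, parses it with BeautifulSoup, builds a list of [category name, link] pairs, walks each category's pagination pages, and appends the collected names to herbDetail.txt.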

import requests
from bs4 import BeautifulSoup


def catchSoup(url):
    # Fetch a page and return its parsed BeautifulSoup tree.
    # url = 'http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml/'
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup

def kindSearch(soup):
    # Collect [category name, link] pairs from the page's <li> entries,
    # skipping the "首页" (home) navigation link.
    herbKind = []
    for new in soup.select('li'):
        if new.text != '首页':
            perKind = [new.text, new.select('a')[0].attrs['href']]
            herbKind.append(perKind)
    return herbKind


def nameSearch(soup):
    # Extract names from the <h3> titles, trimming the leading
    # non-breaking space and the "图片" / "读书" title suffixes.
    # (str.rstrip strips a *set of characters*, not a suffix, so the
    # suffixes are removed explicitly to avoid eating real name characters.)
    herbName = []
    for new in soup.select('h3'):
        pername = new.text.split('_')[0].lstrip('\xa0')
        for suffix in ('图片', '读书'):
            if pername.endswith(suffix):
                pername = pername[:-len(suffix)]
        herbName.append(pername)
    return herbName

def perPage(soup):
    # Collect [text, link] pairs for a category's pagination links,
    # then slice off the first and last entries (the "previous"/"next"
    # style navigation links); slicing is also safe on an empty list.
    kindPage = []
    for new in soup.select('.post.pagebar'):
        for detail in new.select('a'):
            kindPage.append([detail.text, detail.attrs['href']])
    return kindPage[1:-1]
def herbDetail(kind):
    # Fetch the catalog page, look up the requested category, then crawl
    # the category's first page and each of its pagination pages.
    soup = catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml')
    kindName, adds = kindSearch(soup)[kind]   # [name, link] for this category
    totalRecord = []
    print('Crawling ' + str(kind) + '.' + kindName)
    firstPage = catchSoup(adds)               # fetch the category page once
    totalRecord.append(nameSearch(firstPage))
    for add in perPage(firstPage):
        pageAdd = add[1]
        totalRecord.append(nameSearch(catchSoup(pageAdd)))
    print(totalRecord)
    return totalRecord
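# Example usage (the category index 1 is an arbitrary illustration; the
# valid indices depend on what kindSearch finds on the catalog page):
#     records = herbDetail(1)
#     # records == [[names on page 1], [names on page 2], ...]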


if __name__=="__main__":
totalKind=kindSearch(catchSoup(‘http://cul.news.sina.com.cn/topline/2018-04-24/doc-
ifzqvvsa2785251.shtml‘))
totalRecord=[]
kind=0
detailContent = ‘‘
while(kind<20):
totalRecord=herbDetail(kind)
if(kind==0):
detailContent+=‘目录:\n‘
for i in totalKind:
detailContent+=str(totalKind.index(i)+1)+‘.‘+i[0]+‘ ‘
kind+=1
continue
else:
detailContent+=‘\n‘+str(totalKind[kind][0])+‘:\n‘
for i in totalRecord:
detailContent+=str(totalRecord.index(i)+1)+‘.‘+i[0]+‘ ‘
kind+=1

f = open(‘herbDetail.txt‘, ‘a+‘,encoding=‘utf-8‘)
f.write(detailContent)
f.close()
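One caveat: catchSoup has no timeout or HTTP status check, so a slow or failing page can hang the crawl or feed an error page into the parsers. Below is a minimal hardened variant, as a sketch only; the helper name catchSoupSafe and the retry count, backoff, and timeout values are arbitrary choices, not part of the original assignment.

import time

def catchSoupSafe(url, retries=3, timeout=10):
    # Like catchSoup, but fails fast on HTTP errors and retries
    # transient network failures with exponential backoff.
    # (Hypothetical helper; parameter defaults are illustrative.)
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()      # treat 4xx/5xx responses as errors
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.RequestException:
            if attempt == retries - 1:
                raise                   # out of retries: surface the error
            time.sleep(2 ** attempt)    # back off: 1s, 2s, ...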
