(18) Python crawler in practice

Start with the simplest case.

Fenghui (峰绘网): http://www.ifenghui.com/

A comic site that is fairly easy to scrape. I chose it because the jpg links to the comic pages can be read straight from the page source, and every jpg of a chapter is laid out on the page at once.
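This is easy to confirm from the source: each page image sits in an element with class fh-read-img, and the real jpg address is in its ssrc attribute (these are the selectors the scripts below rely on). A minimal check that just prints those addresses for one chapter:

#coding=utf-8
import urllib2
from bs4 import BeautifulSoup

# Quick check: list the jpg addresses exposed in the page source of a single chapter
html_ = urllib2.urlopen("http://www.ifenghui.com/index/comics/read/chapterId/19352.html").read()
soup = BeautifulSoup(html_, "lxml")
for a in soup.find_all(class_="fh-read-img"):
    print a.get('ssrc')  # the actual image URL is stored in the ssrc attribute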

The target: 黑水潭, the title at the top of the popularity ranking.

Scraping a single chapter

昆虫学家, part 1 (28p)

#coding=utf-8
import os
import urllib
import urllib2
from bs4 import BeautifulSoup

# Fetch the chapter page and parse it
request = urllib2.Request("http://www.ifenghui.com/index/comics/read/chapterId/19352.html")
response = urllib2.urlopen(request)
html_ = response.read()
soup = BeautifulSoup(html_, "lxml")

i = 0
for a in soup.find_all(class_="fh-read-img"):  # every page image of the chapter
    i = i + 1
    num = str(i)
    url = a.get('ssrc')  # the real jpg address is stored in the ssrc attribute
    if not os.path.exists("C:/manhua"):
        os.mkdir("C:/manhua")
    file_ = "C:/manhua/" + num + ".jpg"
    urllib.urlretrieve(url, file_)  # download and save, numbered by download order
    print 'Image ' + num + ' downloaded OK'
print 'Download finished'

Swap the URL for another chapter:

昆虫学家, part 2 (28p)

http://www.ifenghui.com/index/comics/read/chapterId/20560.html      

and it works just as well.
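Since only the chapterId at the end of the URL changes between chapters, the single-chapter script can also be wrapped in a small helper so that switching chapters is just a function call. A minimal sketch, assuming the same markup as above; the function name download_chapter and the default save path are my own, not from the original script:

#coding=utf-8
import os
import urllib
import urllib2
from bs4 import BeautifulSoup

def download_chapter(chapter_id, save_dir="C:/manhua"):
    # Same logic as the script above, parameterised by the chapterId in the URL
    url = "http://www.ifenghui.com/index/comics/read/chapterId/%s.html" % chapter_id
    soup = BeautifulSoup(urllib2.urlopen(url).read(), "lxml")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    for i, a in enumerate(soup.find_all(class_="fh-read-img"), 1):
        urllib.urlretrieve(a.get('ssrc'), "%s/%s.jpg" % (save_dir, i))
        print 'Image %d downloaded OK' % i

download_chapter(19352)  # 昆虫学家, part 1
download_chapter(20560)  # 昆虫学家, part 2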


Scraping an entire comic

#coding=utf-8
import os
import re
import urllib
import urllib2
from bs4 import BeautifulSoup

request = urllib2.Request("http://www.ifenghui.com/index/comics/manhua/id/3235.html")  # the comic's table-of-contents page
response = urllib2.urlopen(request)
html_ = response.read()
soup = BeautifulSoup(html_, "lxml")
title_ = ''
tit = soup.title.string  # full page title
for t in tit:
    if t == ' ':
        break
    else:
        title_ = title_ + t  # keep only the part of the title before the first space
findAll = soup.find_all('a', attrs={'href': re.compile('^/index/comics/read/chapterId')})
chapter = findAll[3:]  # drop the first three matched links, keep the chapter list
chapter.reverse()  # reverse the chapter list order
elementNum = len(chapter)  # number of chapters (not actually used below)
i = 0
if not os.path.exists("D:/manhua"):
    os.mkdir("D:/manhua")  # create the download folder on drive D
for eachChapter in chapter:
    i = i + 1
    chapterNum = str(i)  # which chapter is being downloaded
    chapterTitle = eachChapter.string  # title of this chapter
    rootUrl = 'http://www.ifenghui.com'  # site root
    chapterUrl = rootUrl + eachChapter.get('href')  # root + relative path = full chapter URL
    #print chapterTitle  # chapter title
    #print chapterUrl  # chapter URL
    request = urllib2.Request(chapterUrl)
    response = urllib2.urlopen(request)
    html_ = response.read()
    soup = BeautifulSoup(html_, "lxml")
    j = 0  # images are renamed by their download order
    if not os.path.exists("D:/manhua/" + title_):  # chapterTitle could be used here instead
        os.mkdir("D:/manhua/" + title_)  # folder named after the comic
    for a in soup.find_all(class_="fh-read-img"):
        j = j + 1
        pictureNum = str(j)  # which page is being downloaded
        pictureUrl = a.get('ssrc')  # jpg address of this page
        if not os.path.exists("D:/manhua/" + title_ + "/" + chapterNum):  # chapterTitle could be used here instead
            os.mkdir("D:/manhua/" + title_ + "/" + chapterNum)  # one subfolder per chapter
        file_ = "D:/manhua/" + title_ + "/" + chapterNum + "/" + pictureNum + ".jpg"
        urllib.urlretrieve(pictureUrl, file_)  # download and rename
        print 'Chapter ' + chapterNum + ', page ' + pictureNum + ' downloaded OK'
    print 'Chapter ' + chapterNum + ' finished'
print 'All downloads finished'
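The chapter list comes from a regex over the table-of-contents links plus a hard-coded [3:] slice and a reverse(), so before kicking off a long download it is worth printing that list once to confirm it contains exactly the chapters you expect (the commented-out print lines above hint at the same check). A small dry-run sketch using the same selectors:

#coding=utf-8
import re
import urllib2
from bs4 import BeautifulSoup

# Dry run: print the chapter numbers, titles and URLs the downloader would walk through
soup = BeautifulSoup(urllib2.urlopen("http://www.ifenghui.com/index/comics/manhua/id/3235.html").read(), "lxml")
links = soup.find_all('a', attrs={'href': re.compile('^/index/comics/read/chapterId')})
chapters = links[3:]   # same slice as the full script
chapters.reverse()     # same ordering as the full script
for n, a in enumerate(chapters, 1):
    print n, a.string, 'http://www.ifenghui.com' + a.get('href')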


posted @ 2017-07-12 15:00 富坚老贼