selenium爬取煎蛋网

selenium爬取煎蛋网

直接上代码

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions  as ES
import requests
import urllib.request
import os
from lxml import etree
t = 0   # global counter used to rename duplicate author names (see Custer.xqy)
class Custer(object):
    """Selenium crawler that pages through jandan.net/ooxx and saves every
    image into the local ``images/`` directory, one file per author name."""

    # Path to the local chromedriver binary (Windows layout).
    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "http://jandan.net/ooxx"

    def run(self):
        """Main loop: scrape the current page, then follow the
        "Older Comments" link until it disappears."""
        self.driver.get(self.url)
        while True:
            all_source = self.driver.page_source
            html = etree.HTML(all_source)
            self.xqy(html)
            WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located((By.XPATH, "//div[@class='cp-pagenavi']/a[last()]"))
            )
            try:
                Btn = self.driver.find_element_by_xpath("//div[@class='cp-pagenavi']/a[last()]")
                if "Older Comments" in Btn.get_attribute("title"):
                    Btn.click()
                else:
                    break
            except Exception as e:
                # Bug fix: the original bare `except:` printed and retried
                # forever, spinning on a persistent error. Log and stop.
                print("出现异常", e)
                break

    def xqy(self, html):
        """Extract (author, image URL) pairs from one page and download them.

        ``html`` is the lxml tree of the current page source.
        """
        all_content = html.xpath("//div[@class='row']//div")
        all_author = all_content[0].xpath("//div[@class='author']/strong/text()")   # author list

        # De-duplicate author names: repeated names would collapse when
        # zipped into a dict below and their images would be lost.
        for index, item in enumerate(all_author):
            global t
            if item in all_author[0:index]:     # repeated name -> append a running number
                t = t + 1
                all_author[index] = item + str(t)

        WebDriverWait(self.driver, 10).until(
            ES.presence_of_element_located((By.XPATH, "//div[@class='text']//img"))
        )
        all_img = all_content[1].xpath("//div[@class='text']//img//@src")           # image URL list
        # Some src attributes are protocol-relative; prepend the scheme.
        for index, item in enumerate(all_img):
            if 'http:' not in item:
                all_img[index] = 'http:' + item

        dic = dict(zip(all_author, all_img))    # author -> image URL
        # Bug fix: urlretrieve cannot create directories; ensure the target
        # folder exists before the first download.
        os.makedirs('images', exist_ok=True)
        for key in dic:
            hz = os.path.splitext(dic[key])[1]  # extension, e.g. .jpg / .png
            filename = key + hz                 # file name = author + extension
            try:
                urllib.request.urlretrieve(dic[key], 'images/' + filename)
            except OSError as e:
                # Robustness: skip a single failed download instead of
                # aborting the entire crawl.
                print("下载失败", dic[key], e)

def main():
    """Build one crawler and run it until the last page is reached."""
    Custer().run()



# Script entry point: only run the crawler when executed directly.
if __name__ == '__main__':
    main()

爬取的图片

 

进阶

个人加了多线程(不确定是否真正实现了并行爬取),但感觉爬取速度快了很多

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions  as ES
import requests
import threading
import urllib.request
import os
from lxml import etree
t = 0  # global counter used to rename duplicate author names (see Custer.xqy)
gCondition = threading.Condition()  # serializes pagination across crawler threads
class Custer(threading.Thread):
    """Threaded variant of the crawler.

    NOTE(review): ``driver`` is a CLASS attribute, so every thread shares
    one Chrome instance — the threads overlap mainly on image downloads,
    not on page rendering. Pagination is serialized via ``gCondition``.
    """

    # Path to the local chromedriver binary (Windows layout).
    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path)
    url = "http://jandan.net/ooxx"

    def run(self):
        """Thread body: scrape the current page, then follow the
        "Older Comments" link until it disappears."""
        self.driver.get(self.url)
        while True:
            all_source = self.driver.page_source
            html = etree.HTML(all_source)
            self.xqy(html)
            WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located((By.XPATH, "//div[@class='cp-pagenavi']/a[last()]"))
            )
            # Bug fix: the original called gCondition.acquire() but released
            # the lock only on the click path; the `break` and exception
            # paths left it held forever, deadlocking every other thread.
            # `with` guarantees release on all exits.
            with gCondition:
                try:
                    Btn = self.driver.find_element_by_xpath("//div[@class='cp-pagenavi']/a[last()]")
                    if "Older Comments" in Btn.get_attribute("title"):
                        Btn.click()
                    else:
                        break
                except Exception as e:
                    # Bug fix: bare `except:` previously printed and retried
                    # forever on a persistent error. Log and stop.
                    print("出现异常", e)
                    break

    def xqy(self, html):
        """Extract (author, image URL) pairs from one page and download them.

        ``html`` is the lxml tree of the current page source.
        """
        all_content = html.xpath("//div[@class='row']//div")
        all_author = all_content[0].xpath("//div[@class='author']/strong/text()")   # author list

        # De-duplicate author names: repeated names would collapse when
        # zipped into a dict below and their images would be lost.
        # NOTE(review): the global counter `t` is mutated from several
        # threads without a lock — renames may race; confirm if that matters.
        for index, item in enumerate(all_author):
            global t
            if item in all_author[0:index]:     # repeated name -> append a running number
                t = t + 1
                all_author[index] = item + str(t)

        WebDriverWait(self.driver, 10).until(
            ES.presence_of_element_located((By.XPATH, "//div[@class='text']//img"))
        )
        all_img = all_content[1].xpath("//div[@class='text']//img//@src")           # image URL list
        # Some src attributes are protocol-relative; prepend the scheme.
        for index, item in enumerate(all_img):
            if 'http:' not in item:
                all_img[index] = 'http:' + item

        dic = dict(zip(all_author, all_img))    # author -> image URL
        # Bug fix: urlretrieve cannot create directories; ensure the target
        # folder exists before the first download.
        os.makedirs('images', exist_ok=True)
        for key in dic:
            hz = os.path.splitext(dic[key])[1]  # extension, e.g. .jpg / .png
            filename = key + hz                 # file name = author + extension
            try:
                urllib.request.urlretrieve(dic[key], 'images/' + filename)
            except OSError as e:
                # Robustness: skip a single failed download instead of
                # killing the whole thread.
                print("下载失败", dic[key], e)

def main():
    """Start nine crawler threads."""
    for _ in range(9):
        Custer().start()



# Script entry point: only launch the threads when executed directly.
if __name__ == '__main__':
    main()

 

 

posted @ 2019-03-27 19:17  cmap  阅读(250)  评论(0编辑  收藏  举报