Python Development: Web Scraping in Practice (Continued)

Scraping Taobao Products

Target Site Analysis

  • Search for the keyword: drive the browser with Selenium to search for the keyword and obtain the resulting product list
  • Analyze the pager and turn pages: read the total page count, simulate page turns, and obtain the product lists for the following pages
  • Parse and extract product details: analyze the page source with PyQuery and extract each product's fields
  • Store in MongoDB: save the extracted product data to a MongoDB database

Spider Code

spider.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from pyquery import PyQuery as pq
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

def search():
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box is present
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        # Wait until the submit button is clickable
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys(KEY_WORD)
        submit.click()
        # Wait for the result page to load, then grab the element holding the total page count
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        # Parse the first page of results once it has loaded
        get_products()
        return total.text
    except TimeoutException:
        # Selenium waits raise TimeoutException (not the built-in TimeoutError); retry on timeout
        return search()

# Turn to a given results page
def next_page(page_number):
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        # Wait until the submit button is clickable
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()  # clear the page-number input first
        input.send_keys(page_number)
        submit.click()
        # Confirm the highlighted page number matches the requested page
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        # Parse the page only after the turn has succeeded
        get_products()
    except TimeoutException:
        next_page(page_number)

# Parse a results page
def get_products():
    # Wait until the product list has loaded
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source  # grab the page source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],  # strip the trailing '人付款'
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        # print(product)
        save_to_mongo(product)

# Save a product record to MongoDB
def save_to_mongo(result):
    try:
        # insert_one is the modern pymongo call; insert is deprecated
        if db[MONGO_TABLE].insert_one(result):
            print('Saved to MongoDB', result)
    except Exception:
        print('Failed to save to MongoDB', result)

def main():
    try:
        total = search()
        # Extract the total page count (the first number in the pager text)
        total = int(re.findall(r'\d+', total)[0])
        # print(total)
        # Loop from the second page onwards; the first was parsed in search()
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('Something went wrong')
    finally:
        # Close the browser once main() finishes
        browser.close()

if __name__ == '__main__':
    main()

Configuration file: config.py

MONGO_URL = '192.168.8.128'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

KEY_WORD = '美食'  # search keyword ('美食' means 'food')

What a successful save to MongoDB looks like
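Purely as an illustration (the field values below are invented; only the keys come from get_products() above), each document written to the MONGO_TABLE collection has roughly this shape:

{
    'image': '//g-search1.alicdn.com/img/example.jpg',
    'price': '¥128.00',
    'deal': '356',
    'title': '某美食商品',
    'shop': '某某食品旗舰店',
    'location': '杭州'
}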

Maintaining a Proxy Pool with Flask + Redis

Why use a proxy pool?

Many sites deploy dedicated anti-scraping measures, so you may run into problems such as IP bans. At the same time, large numbers of free proxies are published openly on the internet, and they are worth exploiting: by checking them on a schedule you can maintain a set of proxies that are known to work.

What should a proxy pool provide?

  • Scrape proxies from multiple sites and test them asynchronously
  • Filter on a schedule so the pool stays fresh
  • Expose an API so proxies are easy to fetch

Proxy pool architecture

The proxy pool's full source code is available on GitHub; see https://github.com/Python3WebSpider/ProxyPool
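As a minimal sketch of the "expose an API" requirement (this is not the ProxyPool project's actual code; the Redis set name 'proxies', the port, and the proxy string format are all assumptions), a Flask endpoint can hand out a random proxy from a Redis set:

# Minimal proxy-pool API sketch. Assumes a separate checker process keeps
# the Redis set 'proxies' filled with strings like '127.0.0.1:8888'.
from flask import Flask
import redis

app = Flask(__name__)
r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)

@app.route('/get')
def get_proxy():
    # SRANDMEMBER returns a random member without removing it
    proxy = r.srandmember('proxies')
    return proxy or 'no proxy available'

if __name__ == '__main__':
    app.run(port=5000)  # matches proxy_pool_url in the crawler below

The WeChat crawler below fetches from exactly this kind of endpoint (http://127.0.0.1:5000/get) in its own get_proxy() function.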

Using Proxies Against Anti-Scraping: Crawling WeChat Articles

Workflow:

  • Crawl the index pages: request the target site with requests, get the index-page HTML, and return the result
  • Set up proxies: a 302 status code means the IP has been banned, so switch to a proxy and retry
  • Parse the detail pages: request each detail page and extract the title, body, and other fields
  • Save the extracted data to MongoDB

import requests
from urllib.parse import urlencode

from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
import pymongo

base_url = 'http://weixin.sogou.com/weixin?'

client = pymongo.MongoClient('localhost')
db = client['weixin']


headers = {
    'Cookie':'ABTEST=0|1534587111|v1; SNUID=C97E266C1D186DAFFAB826CE1E760218; IPLOC=CN4601; SUID=D4633B716E2F940A000000005B77F0E7; SUID=D4633B713320910A000000005B77F0E7; JSESSIONID=aaaNclsqE7zYShQvbDcvw; SUV=00E025A2713B63D45B77F0E8F8053932',
    'Host':'weixin.sogou.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

keyword = '风景'  # search keyword ('风景' means 'scenery')
proxy_pool_url = 'http://127.0.0.1:5000/get'
proxy = None
max_count = 5  # maximum number of attempts per URL

def get_proxy():
    # Fetch one usable proxy from the proxy pool API
    try:
        response = requests.get(proxy_pool_url)
        if response.status_code==200:
            return response.text
        return None
    except ConnectionError:
        return None

def get_html(url, count=1):
    # Debug output
    print('Crawling', url)
    print('Count', count)
    global proxy
    if count >= max_count:
        print('Tried Too Many Times')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 redirect means this IP has been blocked: switch proxy and retry
            print(302)
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # Pass the incremented count so proxy retries cannot loop forever
                return get_html(url, count + 1)
            else:
                print('get proxy failed')
                return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)

def get_index(keyword, page):
    data = {
        'query':keyword,
        'type':2,
        'page':page
    }
    queries = urlencode(data)  # encode the dict as a GET query string
    url = base_url+queries
    html = get_html(url)
    return html

# Parse the index page, yielding article URLs
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')

def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code==200:
            return response.text
        return None
    except ConnectionError:
        return None

def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()  # class selector: note the leading dot
        date = doc('#post-date').text()
        nickname = doc('.profile_nickname').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title':title,
            'content':content,
            'date':date,
            'nickname':nickname,
            'wechat':wechat
        }
    except XMLSyntaxError:
        return None

def save_to_mongo(data):
    # Upsert on title so re-crawled articles are updated rather than duplicated
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to MongoDB', data['title'])
    else:
        print('Failed to save to MongoDB', data['title'])

def main():
    for page in range(1,101):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)

if __name__ == '__main__':
    main()
