Web Scraping Examples

Examples

1. Scraping an Amazon product page

 
 import requests
 url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
 try:
     kv = {'user-agent': 'Mozilla/5.0'}   # pretend to be a normal browser so the site does not block us
     r = requests.get(url, headers=kv)    # fetch the page with the custom header
     r.raise_for_status()                 # raise an exception for any non-200 status code
     r.encoding = r.apparent_encoding     # set the encoding guessed from the page content
     print(r.text[1000:2000])             # print a slice of the page text
 except Exception:
     print("Scraping failed")

 

2. Submitting search keywords to Baidu and 360

Baidu's keyword interface:

https://www.baidu.com/s?wd=keyword

360's keyword interface:

https://www.so.com/s?q=keyword

 import requests
 kv = {'wd': 'python'}                    # Baidu takes the keyword in the 'wd' parameter
 try:
     r = requests.get("https://www.baidu.com/s", params=kv)
     r.raise_for_status()                 # raise an exception for any non-200 status code
     print(r.request.url)                 # the final URL with the keyword encoded into it
     print(len(r.text))                   # size of the returned page
 except Exception:
     print("Request failed")
                        # note: Baidu may also respond with a verification (captcha) page
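
The same pattern works for 360 search with the 'q' parameter from the interface listed above; a minimal sketch:

 import requests
 kv = {'q': 'python'}                     # 360 takes the keyword in the 'q' parameter
 try:
     r = requests.get("https://www.so.com/s", params=kv)
     r.raise_for_status()
     print(r.request.url)
     print(len(r.text))
 except Exception:
     print("Request failed")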

 

3. Scraping and saving images from the web

A detailed explanation of the with statement: https://blog.csdn.net/chenmozhe22/article/details/81434549
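
In short, with opens the file, runs the block, and closes the file automatically even if an exception is raised inside the block; a minimal sketch with a placeholder filename:

 with open('demo.txt', 'w') as f:     # 'demo.txt' is just a placeholder name
     f.write('hello')
 # at this point the file is already closed, whether or not an exception occurred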

The typical format of an image URL:

http://www.example.com/picture.jpg

 import requests
 import os

 url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"   # image URL
 root = "D:/pics/"                    # directory to save into
 path = root + url.split('/')[-1]     # local file name taken from the last segment of the URL
 try:
     if not os.path.exists(root):     # if the directory does not exist yet
         os.mkdir(root)               # create it
     if not os.path.exists(path):     # only download if the file is not already there
         r = requests.get(url)
         r.raise_for_status()         # raise an exception for any non-200 status code
         with open(path, 'wb') as f:  # open the file in binary write mode
             f.write(r.content)       # r.content is the raw bytes; r.text would be the decoded string
         print("File saved")          # with closes the file for us, no explicit close() needed
     else:
         print("File already exists")
 except Exception:
     print("Scraping failed")

 

A Douban scraping example

 import requests
 from bs4 import BeautifulSoup
 import pandas

 r = requests.get('https://book.douban.com/subject/1084336/comments/').text

 soup = BeautifulSoup(r, 'lxml')                      # parse the HTML with the lxml parser
 pattern = soup.find_all('p', 'comment-content')      # each <p class="comment-content"> holds one comment
 for item in pattern:
     print(item.string)

 comments = []
 for item in pattern:
     comments.append(item.string)
 df = pandas.DataFrame(comments)
 df.to_csv('comments.csv')                            # save the comments to a CSV file
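
Douban may reject the default requests User-Agent; if the request comes back empty or with an error status, sending a browser-like header (as in the Amazon example) is worth trying:

 headers = {'user-agent': 'Mozilla/5.0'}
 r = requests.get('https://book.douban.com/subject/1084336/comments/', headers=headers).text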

 

My favourite: a Zhihu scraping example

 import requests
 import pandas as pd
 import time

 header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4004.138 Safari/537.36"}
 user_data = []

 def get_user_data(page):
     for i in range(page):
         url = 'https://www.zhihu.com/api/v4/members/excited-vczh/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20'.format(i * 20)
         r = requests.get(url, headers=header).json()['data']
         user_data.extend(r)                # add this page's records to user_data
         print('Crawling page %s' % str(i + 1))
         time.sleep(2)                      # pause between requests to avoid hammering the API

 if __name__ == '__main__':
     get_user_data(10)
     df = pd.DataFrame(user_data)
     df.to_excel('test001.xlsx')
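
The include parameter in the URL asks the API for fields such as answer_count, articles_count, gender and follower_count, so you can keep just the columns you care about before saving. A minimal sketch, assuming the response actually uses those field names (the 'name' field is an assumption here):

 df = pd.DataFrame(user_data)
 cols = ['name', 'gender', 'answer_count', 'articles_count', 'follower_count']   # 'name' is assumed, not guaranteed
 df = df[[c for c in cols if c in df.columns]]    # keep only the columns that actually exist
 df.to_excel('followees.xlsx')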

A Taobao scraping example

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import pandas as pd
# chromedriver is stored under C:\Users\DELL\AppData\Local\Google\Chrome\Application
browser = webdriver.Chrome("C:/Users/DELL/AppData/Local/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(browser, 10)      # wait up to 10 seconds for elements to appear
user_data = []

def search():                          # open taobao.com, submit the keyword, and return the result-count text
    try:
        browser.get('https://www.taobao.com/')
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#J_TSearchForm > div.search-button > button')))
        input.send_keys(u'鞋子')               # type the keyword ('鞋子' = shoes) into the search box
        submit.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        return search()                    # on timeout, simply retry the search

def next_page(page_number):                # jump to the given results page
    try:
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page_number)))
        get_products()
    except TimeoutException:
        next_page(page_number)             # on timeout, retry the same page

def get_products():                        # extract product information from the current results page
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    # pq(browser.page_source) plays the same role as the content fetched with requests.get
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image':item.find('.pic .img').attr('src'),
            'price':item.find('.price').text(),
            'deal':item.find('.deal-cnt').text()[:-3],   # drop the trailing "人付款" suffix
            'title':item.find('.title').text(),
            'shop':item.find('.shop').text(),
            'location':item.find('.location').text(),
        }
        print(product)
        user_data.append(product)    # append the product dict (extend would only add its keys)

def main():
    total = search()
    total = int(re.compile(r'(\d+)').search(total).group(1))   # to crawl everything, loop up to total + 1
    for i in range(2,4):                                        # only pages 2 and 3 are crawled here
        next_page(i)

if __name__ == '__main__':
    main()
    df = pd.DataFrame(user_data)
    df.to_excel('test001.xlsx')
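
As the comment in main() suggests, to crawl every results page you would loop up to total + 1 instead of hard-coding the range; a minimal sketch of that variant, reusing the functions defined above:

def main_all_pages():
    total_text = search()                                        # e.g. text like "共 100 页,"
    total = int(re.compile(r'(\d+)').search(total_text).group(1))
    for i in range(2, total + 1):                                # pages 2 through total
        next_page(i)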

 

 
