Web Scraping Examples
1. Scraping an Amazon product page
import requests

url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
    kv = {'user-agent': 'Mozilla/5.0'}   # disguise the client by sending a browser-like User-Agent
    r = requests.get(url, headers=kv)    # fetch the page
    r.raise_for_status()                 # raise an exception for a non-200 status code
    r.encoding = r.apparent_encoding     # use the encoding guessed from the page content
    print(r.text[1000:2000])             # print a slice of the page text
except:
    print("Scraping failed")
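Without the custom header, requests announces itself as python-requests, which Amazon often rejects. A minimal sketch for checking which User-Agent was actually sent, reusing the URL above (the exact response Amazon returns may vary):

import requests

url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"

r = requests.get(url)                      # no custom headers
print(r.request.headers['User-Agent'])     # something like 'python-requests/2.x', easy for the site to block
print(r.status_code)                       # often an error status for such requests

r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
print(r.request.headers['User-Agent'])     # now the request looks like it came from a browser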
2. Submitting search keywords to Baidu and 360
Baidu's keyword search interface:
https://www.baidu.com/s?wd=keyword
360's keyword search interface:
https://www.so.com/s?q=keyword
import requests

kv = {'wd': 'python'}   # Baidu passes the search keyword in the 'wd' parameter
try:
    r = requests.get("https://www.baidu.com/s", params=kv)
    r.raise_for_status()
    print(r.request.url)    # the final URL with the keyword appended
    print(len(r.text))
except:
    print("Request failed")
# Note: Baidu may return a verification (captcha) page instead of real results
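The 360 interface should work the same way, only with a different base URL and the q parameter; a minimal sketch under that assumption (not tested against 360's current anti-bot checks):

import requests

kv = {'q': 'python'}    # 360 uses 'q' instead of 'wd'
try:
    r = requests.get("https://www.so.com/s", params=kv)
    r.raise_for_status()
    print(r.request.url)
    print(len(r.text))
except:
    print("Request failed")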
3. Downloading and saving images from the web
A detailed write-up of the with statement: https://blog.csdn.net/chenmozhe22/article/details/81434549
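In short, a with block closes the file automatically, even if an exception is raised inside it. The two snippets below are roughly equivalent (illustrative sketch only, using a throwaway file name):

# using with: the file is closed automatically when the block ends
with open('demo.txt', 'w') as f:
    f.write('hello')

# roughly what the with statement does behind the scenes
f = open('demo.txt', 'w')
try:
    f.write('hello')
finally:
    f.close()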
Typical format of an image URL:
http://www.example.com/picture.jpg
import requests
import os

url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"   # image URL
root = "D:/pics/"                    # directory to save into
path = root + url.split('/')[-1]     # local file path built from the file name in the URL
try:
    if not os.path.exists(root):     # create the directory if it does not exist yet
        os.mkdir(root)
    if not os.path.exists(path):     # only download if the file is not already saved
        r = requests.get(url)
        with open(path, 'wb') as f:  # open the file for binary writing
            f.write(r.content)       # r.content is the raw bytes; r.text is the decoded text
        print("File saved successfully")
    else:
        print("File already exists")
except:
    print("Download failed")
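The path handling can be made a little more robust with os.makedirs and os.path.basename; a sketch of that variant, using the placeholder URL from above:

import os
import requests

url = "http://www.example.com/picture.jpg"        # placeholder image URL
root = "D:/pics/"

os.makedirs(root, exist_ok=True)                  # creates the directory tree; no error if it already exists
path = os.path.join(root, os.path.basename(url))  # same as url.split('/')[-1], but via the standard library
r = requests.get(url, timeout=10)
r.raise_for_status()
with open(path, 'wb') as f:
    f.write(r.content)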
Scraping Douban book comments
import requests
from bs4 import BeautifulSoup
import pandas

# fetch the comments page of a book on Douban
r = requests.get('https://book.douban.com/subject/1084336/comments/').text
soup = BeautifulSoup(r, 'lxml')
pattern = soup.find_all('p', 'comment-content')   # each comment is a <p class="comment-content">
for item in pattern:
    print(item.string)

# collect the comments and write them to a CSV file
comments = []
for item in pattern:
    comments.append(item.string)
df = pandas.DataFrame(comments)
df.to_csv('comments.csv')
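One caveat: item.string returns None when the paragraph contains nested tags, which leaves empty rows in the CSV. A drop-in replacement for the collecting loop above, using get_text() instead (a sketch; behaviour depends on Douban's current markup):

comments = []
for item in pattern:
    text = item.get_text(strip=True)   # get_text works even when the tag has child elements
    if text:
        comments.append(text)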
My favourite: scraping Zhihu followees
import requests
import pandas as pd
import time

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4004.138 Safari/537.36"}
user_data = []

def get_user_data(page):
    for i in range(page):
        url = 'https://www.zhihu.com/api/v4/members/excited-vczh/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20'.format(i * 20)
        r = requests.get(url, headers=header).json()['data']
        user_data.extend(r)                      # append the response records to user_data
        print('Scraping page %s' % str(i + 1))
        time.sleep(2)                            # pause between requests to avoid hammering the API

if __name__ == '__main__':
    get_user_data(10)
    df = pd.DataFrame(user_data)
    df.to_excel('test001.xlsx')
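Since the include parameter in the URL asks for fields such as answer_count, follower_count and gender, the DataFrame can be trimmed to just those columns before exporting. A sketch, assuming the API returns the field names unchanged (columns that turn out to be missing are simply skipped):

columns = ['name', 'gender', 'answer_count', 'articles_count', 'follower_count']
df = pd.DataFrame(user_data)
df = df[[c for c in columns if c in df.columns]]   # keep only the columns that actually exist
df.to_excel('test001.xlsx')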
Scraping Taobao with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import pandas as pd

# C:\Users\DELL\AppData\Local\Google\Chrome\Application
browser = webdriver.Chrome("C:/Users/DELL/AppData/Local/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(browser, 10)
user_data = []

# open Taobao, search for 鞋子 (shoes) and return the text showing the total number of pages
def search():
    try:
        browser.get('https://www.taobao.com/')
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys(u'鞋子')
        submit.click()
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        return search()

# jump to the given page number
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        get_products()
    except TimeoutException:
        next_page(page_number)

# extract the product information from the current result page
def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)   # pq(browser.page_source) plays the same role here as the content fetched by requests.get
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        user_data.append(product)   # append the whole dict (extend would only add its keys)

def main():
    total = search()
    total = int(re.compile(r'(\d+)').search(total).group(1))   # use range(2, total + 1) to scrape every page
    for i in range(2, 4):
        next_page(i)

if __name__ == '__main__':
    main()
    df = pd.DataFrame(user_data)
    df.to_excel('test001.xlsx')
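Selenium can also drive Chrome in headless mode so no browser window pops up; a minimal sketch of the option setup (the chromedriver path is the same assumption as above, and older Selenium versions use the chrome_options keyword instead of options):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')       # run Chrome without opening a window
options.add_argument('--disable-gpu')
browser = webdriver.Chrome("C:/Users/DELL/AppData/Local/Google/Chrome/Application/chromedriver.exe",
                           options=options)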