Python Web Scraping Primer (1)

A summary of the basics of Python web scraping.

Scraping an image

import requests

url = "https://pixiv.re/91881094.jpg"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

# .content is the raw bytes of the response, so the file is opened in binary mode ("wb")
with open("./c.jpg", "wb") as f:
    response = requests.get(url, headers=headers).content
    f.write(response)

Note that the file is opened in mode wb, i.e. it is written as a binary file.
.content likewise returns the response body as raw bytes,
which is what you generally want for images and other binary data.
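
To make the text/bytes distinction concrete, here is a minimal sketch (example.com is just a placeholder URL):

import requests

resp = requests.get("https://example.com")

html_str = resp.text      # decoded text (str), using the detected/declared encoding
raw_bytes = resp.content  # raw bytes, suitable for images and other binary files

with open("page.html", "w", encoding="utf-8") as f:
    f.write(html_str)

with open("page.bin", "wb") as f:
    f.write(raw_bytes)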

Sending a request

Type an English word and get Baidu Translate's suggestions back.

import json

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}
s = input()
url = "https://fanyi.baidu.com/sug"

# form data, copied from the request payload shown in the browser's dev tools
data = {
    'kw': s
}

response = requests.post(url, data=data, headers=headers)
dic = response.json()

with open("./d.json", "w", encoding='utf-8') as f:
    json.dump(dic, f, ensure_ascii=False)

Note the format of the POST request: the data dict should contain exactly the form fields shown in the request payload in the browser's dev tools.
The response is JSON; when saving it, pass ensure_ascii=False because it contains Chinese.
GET requests take their query parameters via params instead; see the sketch below.
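
A minimal sketch of the params counterpart (httpbin.org is just a demo endpoint, and the parameter names are made up):

import requests

url = "https://httpbin.org/get"        # demo endpoint that echoes the request back as JSON
params = {'kw': 'hello', 'page': '1'}  # hypothetical query parameters

# requests encodes params into the query string: .../get?kw=hello&page=1
response = requests.get(url, params=params)
print(response.json())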

Using regular expressions

import requests
import re
import os

url = "https://movie.douban.com/chart"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

html = requests.get(url=url, headers=headers).text

# target markup looks like:
# <img src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p2897743122.webp" width="75" alt="伸冤人3" class="">
# re.S makes . match newlines; flags go to re.compile (Pattern.findall's second argument is a start position)
pattern = re.compile('<img src="(.*?)" width.*?alt', re.S)
src_list = pattern.findall(html)

if not os.path.exists("./pic"):
    os.makedirs("./pic")

tot = 0
for src in src_list:
    tot = tot + 1
    response = requests.get(url=src, headers=headers).content
    img_path = "./pic/" + str(tot) + '.jpg'
    with open(img_path, "wb") as f:
        f.write(response)

For example, to scrape the posters of some movies on Douban: first download the whole page, then use a regular expression to pull out the URL of each image, and from there it works just like the first example.
The os module is brought in to create the output directory when it does not exist yet; a minimal sketch of that pattern follows.
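
A minimal sketch of that directory-creation pattern (the directory name is arbitrary); passing exist_ok=True makes the explicit existence check unnecessary:

import os

out_dir = "./pic"                    # arbitrary output directory
os.makedirs(out_dir, exist_ok=True)  # create it if missing, do nothing otherwise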

Fetching JSON, then picking the needed URLs out of the resulting dicts and lists

import os

import requests

url = "https://movie.douban.com/j/chart/top_list"
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '20',   # offset of the first movie; overwritten in the loop below
    'limit': '20',   # number of movies per request
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

file_path = './pic_of_moive'
if not os.path.exists(file_path):
    os.makedirs(file_path)

tot = 0

# page through the JSON API in steps of 20
for id in range(0, 40, 20):
    param['start'] = str(id)
    response = requests.get(url=url, params=param, headers=headers).json()
    for item in response:
        s = item['cover_url']
        pic = requests.get(url=s, headers=headers).content
        pic_path = file_path + '/' + str(tot) + '.jpg'
        tot = tot + 1
        with open(pic_path, 'wb') as f:
            f.write(pic)

Fixing garbled Chinese (encoding problems)

import requests

url = "https://sanguo.5000yan.com/965.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

response = requests.get(url=url, headers=headers)
# tell requests which encoding to use before reading .text
response.encoding = "utf-8"

with open("./a.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

Generally, when the page's HTML is UTF-8 and the file is also written as UTF-8, there is no problem, but some pages still come out garbled because requests guesses the wrong encoding; the fix is to set response.encoding explicitly, as above.
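
If you would rather not hard-code the charset, requests can also guess it from the body via apparent_encoding (a minimal sketch; the detection is heuristic, so it can occasionally still be wrong):

import requests

response = requests.get("https://sanguo.5000yan.com/965.html")
# apparent_encoding is detected from the response body rather than the HTTP headers
response.encoding = response.apparent_encoding
print(response.text[:200])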

Spaces in CSS selectors

This selector is problematic:

a_list=soup.select('main-content container>ul>li>a')

In a CSS selector a space is a descendant combinator, so main-content and container above are interpreted as two separate tag names. The fix is to replace the space with ".", so that both class names are matched on the same element, and to prefix the tag name (main); a quick check follows the corrected selector below.

a_list=soup.select('main.main-content.container>ul>li>a')
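
A quick way to see the difference, as a minimal sketch on a handmade snippet (not the real page):

from bs4 import BeautifulSoup

html = '''
<main class="main-content container">
  <ul><li><a href="/1.html">chapter 1</a></li></ul>
</main>
'''
soup = BeautifulSoup(html, 'lxml')

# the space is a descendant combinator, so these are looked up as tag names and nothing matches
print(soup.select('main-content container>ul>li>a'))      # []
# chained class selectors on one element match as intended
print(soup.select('main.main-content.container>ul>li>a')) # [<a href="/1.html">chapter 1</a>]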

bs4

import requests
from bs4 import BeautifulSoup

url = "https://sanguo.5000yan.com/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
html_text = response.text

soup = BeautifulSoup(html_text, 'lxml')
# one <a> per chapter: the link text is the title, href points to the detail page
a_list = soup.select('main.main-content.container>ul>li>a')

with open("./sanguo.txt", "w", encoding='utf-8') as f:
    for a in a_list:
        title = a.string
        detail_src = a['href']

        detail_response = requests.get(url=detail_src, headers=headers)
        detail_response.encoding = 'utf-8'
        detail_text = detail_response.text

        detail_soup = BeautifulSoup(detail_text, 'lxml')
        content = detail_soup.find('div', class_='grap').text
        content = content.replace('\n', '')  # strip newlines inside the chapter text
        f.write(title + ':' + content + '\n')


You can locate the tags you need with select().
A tag's attributes are read with [], e.g. a['href'].
A tag's content is read with text or string:
text gathers the text of the tag and all of its descendants,
while string only returns the tag's own directly contained string (and is None when the tag has other children); see the short sketch below.
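
A minimal sketch of the text/string difference on a handmade snippet:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div>hello <b>world</b></div>', 'lxml')
div = soup.div

print(div.text)       # 'hello world'  -> text of the tag and every descendant
print(div.string)     # None           -> string is only set when the tag has a single string child
print(soup.b.string)  # 'world'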

XPath

/ is a single level; at the start of the expression it means locating from the root node
// matches any number of levels, i.e. locating from an arbitrary position
[@class="content"] locates by attribute
index predicates start at 1
/text() returns the tag's own text, //text() returns all text underneath it; both return lists
/@attr returns a list of attribute values
for partial (relative) parsing, start with ./ and append the rest of the XPath expression
before writing code, it also helps to press Ctrl+F in the browser's Elements panel and test whether the XPath expression matches what you expect; a small sketch of these rules follows
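
A minimal sketch of these rules on a handmade snippet (the HTML is made up purely for illustration):

from lxml import etree

html = '''
<html><body>
  <div class="content"><p>first</p><p>second</p></div>
  <div class="other"><a href="/x.html">link</a></div>
</body></html>
'''
tree = etree.HTML(html)

print(tree.xpath('/html/body/div'))                       # absolute path from the root
print(tree.xpath('//div[@class="content"]/p[1]/text()'))  # ['first']  (indexing starts at 1)
print(tree.xpath('//div//text()'))                        # all text anywhere under a div
print(tree.xpath('//a/@href'))                            # ['/x.html']

div = tree.xpath('//div[@class="content"]')[0]
print(div.xpath('./p[2]/text()'))                         # ['second']  (relative to div)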

from lxml import etree
import requests

url = "https://www.58.com/ershoufang/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)

# each listing title sits in an <a> inside <td class="t">
a_list = tree.xpath('//td[@class="t"]/a')

with open("./a.txt", "w", encoding='utf-8') as f:
    for a in a_list:
        title = a.xpath("./text()")[0]
        f.write(title + '\n')

tbody

When tbody appears in the path, just drop it. At first I was puzzled why I kept getting an empty list; then I noticed the editor had underlined tbody, looked it up, and it really was the problem (browsers insert tbody into the DOM even when the page source does not contain it, so it is often missing from the HTML you actually download). Luckily not too much time was wasted. A small before/after sketch follows.
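
A minimal before/after sketch, assuming the downloaded HTML has no tbody (typical when the browser added it on its own):

from lxml import etree

# raw HTML as actually served: the <table> contains no <tbody>
html = '<table border="0"><tr><td class="t"><a>some listing</a></td></tr></table>'
tree = etree.HTML(html)

# copying the DevTools path verbatim (with tbody) finds nothing
print(tree.xpath('//table[@border="0"]/tbody/tr'))  # []
# dropping tbody matches the rows that are really there
print(tree.xpath('//table[@border="0"]/tr'))        # [<Element tr at 0x...>]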

from lxml import etree
import requests

url = "https://www.58.com/ershoufang/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)

# note: no tbody in the path, even though DevTools shows one
tr_list = tree.xpath('//table[@border="0"]/tr')

for tr in tr_list:
    title = tr.xpath("./td[@class='t']/a/text()")[0]
    money = tr.xpath("./td[@class='pred'][1]/b/text()")[0]
    print(title + " " + money + "\n")


Combining XPath with pagination to batch-download images

from lxml import etree
from fake_useragent import UserAgent
import requests
import os

# pagination looks like http://www.netbian.com/fengjing/index_2.htm
s = "http://www.netbian.com/fengjing/"

ua = UserAgent()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

topfile_path = "./downpic"

for page in range(9, 10):
    file_path = topfile_path + f'/page{page}'
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    tot = 0
    url = s + f"index_{page}.htm"

    # rotate the User-Agent for every request
    headers['User-Agent'] = ua.random
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    a_list = tree.xpath("//div[@class='list']/ul/li/a")

    for a in a_list:
        img_src = a.xpath("./img/@src")[0]
        tot = tot + 1

        headers['User-Agent'] = ua.random
        content = requests.get(url=img_src, headers=headers).content
        save_path = file_path + '/' + str(tot) + '.jpg'

        with open(save_path, "wb") as f:
            f.write(content)

fake_useragent can be used to get a fresh User-Agent for every request; a short sketch follows.
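
A minimal sketch of that rotation on its own:

from fake_useragent import UserAgent

ua = UserAgent()

for _ in range(3):
    headers = {'User-Agent': ua.random}  # a different randomly chosen browser UA each time
    print(headers['User-Agent'])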

IP proxies, and fixing GBK mojibake

from lxml import etree
from fake_useragent import UserAgent
import requests
import random

ua = UserAgent()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}


def get_newip():
    # scrape one random page of free proxies from kuaidaili and append them to proxy_list
    page_id = random.randint(1, 50)
    ip_url = f"https://www.kuaidaili.com/free/inha/{page_id}"

    ip_text = requests.get(url=ip_url, headers=headers).text
    tree = etree.HTML(ip_text)

    ip_list = tree.xpath('//div[@class="table-section"]//td[@data-title="IP"]')
    port_list = tree.xpath('//div[@class="table-section"]//td[@data-title="PORT"]')

    for a, b in zip(ip_list, port_list):
        ip = a.xpath('./text()')[0].strip()
        port = b.xpath('./text()')[0].strip()
        proxy_list.append({
            'ip': ip,
            'port': port
        })


def get_proxy(ele):
    # turn an {'ip', 'port'} entry into the proxies dict that requests expects
    return {
        "http": "http://" + ele["ip"] + ":" + ele["port"],
        "https": "https://" + ele["ip"] + ":" + ele["port"]
    }


def ask(surl):
    # try the URL once through a randomly chosen proxy; return (1, response) on success, (0, 0) on failure
    id = random.randint(0, len(proxy_list) - 1)
    aproxy = get_proxy(proxy_list[id])
    headers['User-Agent'] = ua.random

    print(aproxy)

    try:
        r = requests.get(url=surl, headers=headers, proxies=aproxy, timeout=3)
        if r.status_code == 200:
            return 1, r
        else:
            print("fail!")
            return 0, 0
    except Exception:
        # dead proxy: drop it and top the pool up when it runs low
        print("fail!")
        proxy_list.remove(proxy_list[id])
        if len(proxy_list) < 5:
            get_newip()
        return 0, 0


def work(surl):
    # keep retrying with different proxies until one request succeeds
    m_state, m_response = ask(surl)
    while m_state == 0:
        m_state, m_response = ask(surl)
    return m_response


proxy_list = []
get_newip()
url = "http://www.netbian.com/jianzhu/index.htm"
response = work(url)
content = response.text
# the page is GBK but requests decoded it as ISO-8859-1; re-encode and decode with the right codec
content = content.encode('iso-8859-1').decode('gbk')
print(content)


Once the number of requests grows, you start to worry about your IP getting banned, so we keep switching IPs while still pretending to be a browser.
Concretely: scrape free proxies from kuaidaili into proxy_list, pick one at random for each request, remove it if it fails, and scrape a fresh page of proxies whenever the list drops below 5 entries.

content = content.encode('iso-8859-1').decode('gbk') fixes the GBK mojibake: requests decoded the GBK page as ISO-8859-1, so re-encoding with ISO-8859-1 recovers the original bytes, which can then be decoded as GBK.
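
An alternative that avoids the encode/decode round-trip, as a minimal sketch (assuming the page really is served as GBK): declare the encoding before touching .text.

import requests

response = requests.get("http://www.netbian.com/jianzhu/index.htm")
response.encoding = 'gbk'   # declare the page's real encoding before reading .text
print(response.text[:200])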

To be honest, since the proxies are free, a lot of them simply don't work; it often takes a dozen or even several dozen attempts before one gets through.

posted @ 2023-10-14 23:15 gan_coder