This is a set of crawler templates for quick reference when writing a scraper.
urllib.request template
from urllib.request import Request, urlopen
from urllib.parse import urlencode
url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
def fetch(filename):
    request = Request(url=url, headers=headers)
    response = urlopen(request)  # send the request
    assert response.code == 200
    resp_text = response.read()  # read the body
    # Write the data out: open with 'wb' for images/binary, 'w' for text
    with open(filename, 'wb') as f:
        f.write(resp_text)
if __name__ == '__main__':
    fetch('output.bin')  # call the template function
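urlencode (imported above but unused) builds query strings; a minimal sketch, with a made-up endpoint and query fields:
from urllib.parse import urlencode
from urllib.request import Request, urlopen

params = urlencode({'wd': 'python', 'pn': 0})  # hypothetical query fields
req = Request(url='https://example.com/s?' + params,
              headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req) as resp:
    print(resp.status, len(resp.read()))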
#######################################################
import requests
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        # or: res = requests.post(url, data=data, headers=headers)
        r.raise_for_status()  # raise HTTPError if the status is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'request failed'
if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHTMLText(url))
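A POST variant of the same try/except pattern (the endpoint and form fields here are placeholders):
import requests

def postHTMLText(url, data):
    # data is a dict of form fields, e.g. {'kw': 'python'} -- placeholder values
    try:
        r = requests.post(url, data=data, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'request failed'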
bs4
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
req = urllib.request.Request(url,headers=headers)
resp = urllib.request.urlopen(req)
html_text = resp.read()
# Parse the byte string into an HTML document tree
html = BeautifulSoup(html_text, 'lxml')
with open('filename.html', 'w', encoding='utf-8') as f:
    f.write(str(html))
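Beyond saving the tree, BeautifulSoup can query it directly; a self-contained sketch using a made-up snippet:
from bs4 import BeautifulSoup

doc = '<html><head><title>demo</title></head><body><div class="content"><a href="/a">A</a></div></body></html>'
soup = BeautifulSoup(doc, 'lxml')
print(soup.title.string)                      # tag access -> demo
print(soup.find('a')['href'])                 # first match -> /a
for a in soup.select('div.content a[href]'):  # CSS selector
    print(a.get_text(strip=True))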
xpath
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="main"]/div/div')
    if not os.path.exists('./muban'):
        os.mkdir('./muban')
    url_list = []
    name_list = []
    for div in div_list:
        muban_href = div.xpath('./a/@href')[0]
        # Fetch the detail page for this resume template
        detail_text = requests.get(url=muban_href, headers=headers).text
        detail_tree = etree.HTML(detail_text)
        name = detail_tree.xpath('//div[@class="bread clearfix"]/a[3]/text()')[0] + '.rar'
        # The page is served as iso-8859-1 but is really utf-8; re-decode the name
        download_name = name.encode('iso-8859-1').decode('utf-8')
        # xpath() returns a list; take the first hit
        download_url = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')[0]
        url_list.append(download_url)
        name_list.append(download_name)
    for name, url in zip(name_list, url_list):
        print(name, url)
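The loop above only prints what was collected; a sketch that continues the __main__ block and actually saves each archive into ./muban, reusing the headers and lists built above:
    for name, download in zip(name_list, url_list):
        data = requests.get(url=download, headers=headers).content
        with open('./muban/' + name, 'wb') as f:
            f.write(data)
        print(name, 'saved')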
########################################
import requests
from lxml import etree
class RequestError(Exception):
pass
class ParseError(Exception):
pass
def get(url):
    headers = {
        'user-agent': ''
    }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        parse(resp.text)
    else:
        raise RequestError('request failed')
def parse(html):
root = etree.HTML(html)
divs = root.xpath('//div')
    '''
    //              search from the document root
    ./              relative to the current node
    .//             search anywhere below the current node
    //title/text()  extract text
    //img/@src      extract an attribute
    To read the page's content type and charset, look at the first <meta> tag.
    '''
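A self-contained demo of the XPath notes above (the HTML snippet is made up):
from lxml import etree

snippet = '<html><head><meta charset="utf-8"><title>demo</title></head><body><div><img src="/a.png"><a href="/next">next</a></div></body></html>'
root = etree.HTML(snippet)
print(root.xpath('//title/text()'))      # ['demo']
print(root.xpath('//img/@src'))          # ['/a.png']
print(root.xpath('//meta[1]/@charset'))  # first <meta> tag -> ['utf-8']
div = root.xpath('//div')[0]
print(div.xpath('./a/@href'))            # relative to the current node -> ['/next']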
re
import re
import os
import requests
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
url = ''
resp = requests.get(url=url, headers=headers)
resp.encoding = 'utf-8'
assert resp.status_code == 200
html = resp.text
with open('filename.html', 'w', encoding='utf-8') as f:
f.write(html)
compile_ = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
compile_2 = re.compile(r'<img src="(.*?)" alt="(.*?)">')
imgs = compile_.findall(html)
if len(imgs) ==0:
imgs = compile_2.findall(html)
print(len(imgs),imgs,sep='\n')
next_url = re.findall(r'<a href="(.*?)" class="nextpage">下一页</a>', html, re.S)  # grab the "next page" link
'''
Regex quick reference:
^   start of string
$   end of string
\w  letter, digit, or underscore
\b  word boundary
\s  whitespace character
\W  anything that is not a letter, digit, or underscore
\S  any non-whitespace character
\D  any non-digit character
\B  a position that is not a word boundary
'''
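A quick demo of the tokens above:
import re

s = 'user_1 paid 42 dollars'
print(re.findall(r'\w+', s))                  # ['user_1', 'paid', '42', 'dollars']
print(re.findall(r'\d+', s))                  # ['42']
print(re.findall(r'\S+', s))                  # non-whitespace runs, same four tokens
print(re.match(r'^user', s) is not None)      # True: anchored at the start
print(re.search(r'dollars$', s) is not None)  # True: anchored at the end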
selenium
# Template 1
from selenium import webdriver
import unittest
from selenium.webdriver.support.wait import WebDriverWait
class TestLogin(unittest.TestCase):
    # choose the browser driver
    def setUp(self):
        self.driver = webdriver.Firefox(executable_path=r"F:\Program Files (x86)\Mozilla Firefox\geckodriver.exe")
        # open the url
        self.driver.get("http://192.168.1.151:8080/login?from=%2F")
    # log in
    def test_login(self):
        username = "test001"
        password = "pass001"
        # perform the login
        # locate the username field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).send_keys(username)
        # locate the password field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).send_keys(password)
        # click the login button
        self.driver.find_element_by_id("yui-gen1-button").click()
        # assert that the login succeeded
currUrl = self.driver.current_url
print("currUrl"+currUrl)
if currUrl == "http://192.168.1.151:8080/":
print("success")
else:
print("failure")
    # close the browser
def tearDown(self):
self.driver.quit()
if __name__ == "__main__":
unittest.main()
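The lambda waits above work, but Selenium's expected_conditions read more clearly; a sketch of the same wait-then-type step (the locators are the ones from the template):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def wait_and_type(driver, locator, text, timeout=10):
    # Block until the element is present, then clear it and type into it
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator))
    element.clear()
    element.send_keys(text)

# usage inside test_login: wait_and_type(self.driver, (By.ID, 'j_username'), username)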
##### Scraping a job-listing site (zhaopin.com)
import re
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions
from utils.get_headers import user_agent
headers = user_agent()
chrome = Chrome(executable_path='chromedriver')
def get_all_city():
url = 'https://www.zhaopin.com/citymap'
resp = requests.get(url,headers=headers)
if resp.status_code == 200:
# resp.encoding = 'gbk'
html = resp.text
        s = re.search(r'<script>__INITIAL_STATE__=(.*?)</script>', html)
        json_data = s.group(1)  # group(1) is the JSON blob between the tags
data = json.loads(json_data)
cityMapList = data['cityList']['cityMapList']
for letter, citys in cityMapList.items():
print(f'---{letter}---')
for city in citys:
yield city
def get_city_job(url):
chrome.get(url)
    # find the search box
search = chrome.find_element_by_xpath('//input[@class="zp-search__input"]')
search.send_keys('python')
    # scroll the page if needed
    # chrome.execute_script('window.scrollTo(0, document.body.scrollHeight)')
btn = chrome.find_element_by_class_name('zp-search__btn--blue')
btn.click()
    # the search results open in a second window; switch to it
chrome.switch_to.window(chrome.window_handles[1])
    time.sleep(10)  # pause here for a manual login
divs = chrome.find_elements_by_class_name('iteminfo')
for div in divs:
title = div.find_element(By.XPATH,'.//span[@class="iteminfo__line1__jobname__name"]')
salary = div.find_element(By.XPATH, './/p[@class="iteminfo__line2__jobdesc__salary"]')
title = title.text
salary = salary.text
        # welfare tags could be pulled the same way, e.g.:
        # for tag in div.find_elements(By.XPATH, './/div[@class="iteminfo__line3__welfare"]/div'):
        #     print(tag.text)
print(title,salary)
if __name__ == '__main__':
    # get_all_city() is a generator; iterate it to actually print the cities
    for city in get_all_city():
        print(city)
    get_city_job('https://www.zhaopin.com/citymap')
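utils.get_headers.user_agent is a local helper not shown here; a minimal stand-in, assuming it just returns a headers dict:
def user_agent():
    # Stand-in for utils.get_headers.user_agent (assumption: returns a headers dict)
    return {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/94.0.4606.61 Safari/537.36'}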
Video scraping (m3u8)
'''
1. Fetch the HTML page source.
2. Extract the m3u8 URL from the source.
3. Download the m3u8 playlist.
4. Read the m3u8 file and download each video segment.
5. Merge the segments into one video.
'''
import requests
import re
import os
obj = re.compile(r"url: '(?P<url>.*?)',",re.S)
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
url = 'https://www.91kanju.com/vod-play/59780-1-1.html'
resp = requests.get(url,headers=headers)
m3u8_url = obj.search(resp.text).group('url')
resp.close()
resp2 = requests.get(m3u8_url, headers=headers)
os.makedirs('./dianying', exist_ok=True)
with open('./dianying/wanmei.m3u8', mode='wb') as f:
    f.write(resp2.content)
resp2.close()
print('m3u8 downloaded')
# Parse the m3u8: every non-comment line is a segment URL
n = 1
with open('./dianying/wanmei.m3u8', mode='r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line.startswith('#'):
            continue
        resp3 = requests.get(line)
        # Use a distinct name so we don't shadow the playlist handle f
        with open(f'./dianying/{n}.ts', mode='wb') as seg:
            seg.write(resp3.content)
        resp3.close()
        n += 1
print('all segments downloaded')
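Step 5 (merging) is not implemented above; a minimal sketch that concatenates the .ts segments in numeric order (raw TS streams concatenate byte-wise; for a clean container, re-mux with ffmpeg):
import os

seg_dir = './dianying'
segments = sorted((name for name in os.listdir(seg_dir) if name.endswith('.ts')),
                  key=lambda name: int(name.split('.')[0]))
with open(os.path.join(seg_dir, 'wanmei_full.ts'), 'wb') as out:
    for seg in segments:
        with open(os.path.join(seg_dir, seg), 'rb') as part:
            out.write(part.read())
print('merged', len(segments), 'segments')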