This is a set of crawler templates for quick reference when writing a scraper.
urllib.request template
from urllib.request import Request, urlopen
from urllib.parse import urlencode
url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
def fetch(filename):
    request = Request(url=url, headers=headers)
    response = urlopen(request)  # send the request
    assert response.code == 200
    resp_text = response.read()  # read the body
    # Write the data out: open with 'wb' for images/binary, 'w' for text
    with open(filename, 'wb') as f:
        f.write(resp_text)
if __name__ == '__main__':
    fetch('output.bin')  # call the template function
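urlencode (imported above but unused) builds query strings; a minimal sketch, with a made-up endpoint and query fields:
from urllib.parse import urlencode
from urllib.request import Request, urlopen

params = urlencode({'wd': 'python', 'pn': 0})  # hypothetical query fields
req = Request(url='https://example.com/s?' + params,
              headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req) as resp:
    print(resp.status, len(resp.read()))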
#######################################################
import requests
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        # or: res = requests.post(url, data=data, headers=headers)
        r.raise_for_status()  # raise HTTPError if the status is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'request failed'
if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHTMLText(url))
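A POST variant of the same try/except pattern (the endpoint and form fields here are placeholders):
import requests

def postHTMLText(url, data):
    # data is a dict of form fields, e.g. {'kw': 'python'} -- placeholder values
    try:
        r = requests.post(url, data=data, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return 'request failed'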
bs4
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
req = urllib.request.Request(url,headers=headers)
resp = urllib.request.urlopen(req)
html_text = resp.read()
# Parse the byte string into an HTML document tree
html = BeautifulSoup(html_text, 'lxml')
with open('filename.html', 'w', encoding='utf-8') as f:
    f.write(str(html))
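Beyond saving the tree, BeautifulSoup can query it directly; a self-contained sketch using a made-up snippet:
from bs4 import BeautifulSoup

doc = '<html><head><title>demo</title></head><body><div class="content"><a href="/a">A</a></div></body></html>'
soup = BeautifulSoup(doc, 'lxml')
print(soup.title.string)                      # tag access -> demo
print(soup.find('a')['href'])                 # first match -> /a
for a in soup.select('div.content a[href]'):  # CSS selector
    print(a.get_text(strip=True))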
xpath
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="main"]/div/div')
    if not os.path.exists('./muban'):
        os.mkdir('./muban')
    url_list = []
    name_list = []
    for div in div_list:
        muban_href = div.xpath('./a/@href')[0]
        # Fetch the detail page for this resume template
        detail_text = requests.get(url=muban_href, headers=headers).text
        detail_tree = etree.HTML(detail_text)
        name = detail_tree.xpath('//div[@class="bread clearfix"]/a[3]/text()')[0] + '.rar'
        # The page is served as iso-8859-1 but is really utf-8; re-decode the name
        download_name = name.encode('iso-8859-1').decode('utf-8')
        # xpath() returns a list; take the first hit
        download_url = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')[0]
        url_list.append(download_url)
        name_list.append(download_name)
    for name, url in zip(name_list, url_list):
        print(name, url)
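The loop above only prints what was collected; a sketch that continues the __main__ block and actually saves each archive into ./muban, reusing the headers and lists built above:
    for name, download in zip(name_list, url_list):
        data = requests.get(url=download, headers=headers).content
        with open('./muban/' + name, 'wb') as f:
            f.write(data)
        print(name, 'saved')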
########################################
import requests
from lxml import etree
class RequestError(Exception):
pass
class ParseError(Exception):
pass
def get(url):
    headers = {
        'user-agent': ''
    }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        parse(resp.text)
    else:
        raise RequestError('request failed')
def parse(html):
root = etree.HTML(html)
divs = root.xpath('//div')
    '''
    //              search from the document root
    ./              relative to the current node
    .//             search anywhere below the current node
    //title/text()  extract text
    //img/@src      extract an attribute
    To read the page's content type and charset, look at the first <meta> tag.
    '''
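A self-contained demo of the XPath notes above (the HTML snippet is made up):
from lxml import etree

snippet = '<html><head><meta charset="utf-8"><title>demo</title></head><body><div><img src="/a.png"><a href="/next">next</a></div></body></html>'
root = etree.HTML(snippet)
print(root.xpath('//title/text()'))      # ['demo']
print(root.xpath('//img/@src'))          # ['/a.png']
print(root.xpath('//meta[1]/@charset'))  # first <meta> tag -> ['utf-8']
div = root.xpath('//div')[0]
print(div.xpath('./a/@href'))            # relative to the current node -> ['/next']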
re
import re
import os
import requests
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
url = ''
resp = requests.get(url=url, headers=headers)
resp.encoding = 'utf-8'
assert resp.status_code == 200
html = resp.text
with open('filename.html', 'w', encoding='utf-8') as f:
f.write(html)
compile_ = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
compile_2 = re.compile(r'<img src="(.*?)" alt="(.*?)">')
imgs = compile_.findall(html)
if len(imgs) ==0:
imgs = compile_2.findall(html)
print(len(imgs),imgs,sep='\n')
next_url = re.findall(r'<a href="(.*?)" class="nextpage">下一页</a>', html, re.S)  # grab the "next page" link
'''
Regex quick reference:
^   start of string
$   end of string
\w  letter, digit, or underscore
\b  word boundary
\s  whitespace character
\W  anything that is not a letter, digit, or underscore
\S  any non-whitespace character
\D  any non-digit character
\B  a position that is not a word boundary
'''
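A quick demo of the tokens above:
import re

s = 'user_1 paid 42 dollars'
print(re.findall(r'\w+', s))                  # ['user_1', 'paid', '42', 'dollars']
print(re.findall(r'\d+', s))                  # ['42']
print(re.findall(r'\S+', s))                  # non-whitespace runs, same four tokens
print(re.match(r'^user', s) is not None)      # True: anchored at the start
print(re.search(r'dollars$', s) is not None)  # True: anchored at the end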
selenium
# Template 1
from selenium import webdriver
import unittest
from selenium.webdriver.support.wait import WebDriverWait
class TestLogin(unittest.TestCase):
    # choose the browser driver
    def setUp(self):
        self.driver = webdriver.Firefox(executable_path=r"F:\Program Files (x86)\Mozilla Firefox\geckodriver.exe")
        # open the url
        self.driver.get("http://192.168.1.151:8080/login?from=%2F")
    # log in
    def test_login(self):
        username = "test001"
        password = "pass001"
        # perform the login
        # locate the username field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).send_keys(username)
        # locate the password field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).send_keys(password)
        # click the login button
        self.driver.find_element_by_id("yui-gen1-button").click()
        # assert that the login succeeded
currUrl = self.driver.current_url
print("currUrl"+currUrl)
if currUrl == "http://192.168.1.151:8080/":
print("success")
else:
print("failure")
    # close the browser
def tearDown(self):
self.driver.quit()
if __name__ == "__main__":
unittest.main()
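The lambda waits above work, but Selenium's expected_conditions read more clearly; a sketch of the same wait-then-type step (the locators are the ones from the template):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def wait_and_type(driver, locator, text, timeout=10):
    # Block until the element is present, then clear it and type into it
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator))
    element.clear()
    element.send_keys(text)

# usage inside test_login: wait_and_type(self.driver, (By.ID, 'j_username'), username)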
##### Scraping a job-listing site (zhaopin.com)
import re
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions
from utils.get_headers import user_agent
headers = user_agent()
chrome = Chrome(executable_path='chromedriver')
def get_all_city():
url = 'https://www.zhaopin.com/citymap'
resp = requests.get(url,headers=headers)
if resp.status_code == 200:
# resp.encoding = 'gbk'
html = resp.text
        s = re.search(r'<script>__INITIAL_STATE__=(.*?)</script>', html)
        json_data = s.group(1)  # group(1) is the JSON blob between the tags
data = json.loads(json_data)
cityMapList = data['cityList']['cityMapList']
for letter, citys in cityMapList.items():
print(f'---{letter}---')
for city in citys:
yield city
def get_city_job(url):
chrome.get(url)
    # find the search box
search = chrome.find_element_by_xpath('//input[@class="zp-search__input"]')
search.send_keys('python')
    # scroll the page if needed
    # chrome.execute_script('window.scrollTo(0, document.body.scrollHeight)')
btn = chrome.find_element_by_class_name('zp-search__btn--blue')
btn.click()
    # the search results open in a second window; switch to it
chrome.switch_to.window(chrome.window_handles[1])
    time.sleep(10)  # pause here for a manual login
divs = chrome.find_elements_by_class_name('iteminfo')
for div in divs:
title = div.find_element(By.XPATH,'.//span[@class="iteminfo__line1__jobname__name"]')
salary = div.find_element(By.XPATH, './/p[@class="iteminfo__line2__jobdesc__salary"]')
title = title.text
salary = salary.text
        # welfare tags could be pulled the same way, e.g.:
        # for tag in div.find_elements(By.XPATH, './/div[@class="iteminfo__line3__welfare"]/div'):
        #     print(tag.text)
print(title,salary)
if __name__ == '__main__':
    # get_all_city() is a generator; iterate it to actually print the cities
    for city in get_all_city():
        print(city)
    get_city_job('https://www.zhaopin.com/citymap')
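utils.get_headers.user_agent is a local helper not shown here; a minimal stand-in, assuming it just returns a headers dict:
def user_agent():
    # Stand-in for utils.get_headers.user_agent (assumption: returns a headers dict)
    return {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/94.0.4606.61 Safari/537.36'}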
Video scraping (m3u8)
'''
1. Fetch the HTML page source.
2. Extract the m3u8 URL from the source.
3. Download the m3u8 playlist.
4. Read the m3u8 file and download each video segment.
5. Merge the segments into one video.
'''
import requests
import re
import os
obj = re.compile(r"url: '(?P<url>.*?)',",re.S)
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
url = 'https://www.91kanju.com/vod-play/59780-1-1.html'
resp = requests.get(url,headers=headers)
m3u8_url = obj.search(resp.text).group('url')
resp.close()
resp2 = requests.get(m3u8_url, headers=headers)
os.makedirs('./dianying', exist_ok=True)
with open('./dianying/wanmei.m3u8', mode='wb') as f:
    f.write(resp2.content)
resp2.close()
print('m3u8 downloaded')
# Parse the m3u8: every non-comment line is a segment URL
n = 1
with open('./dianying/wanmei.m3u8', mode='r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line.startswith('#'):
            continue
        resp3 = requests.get(line)
        # Use a distinct name so we don't shadow the playlist handle f
        with open(f'./dianying/{n}.ts', mode='wb') as seg:
            seg.write(resp3.content)
        resp3.close()
        n += 1
print('all segments downloaded')
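Step 5 (merging) is not implemented above; a minimal sketch that concatenates the .ts segments in numeric order (raw TS streams concatenate byte-wise; for a clean container, re-mux with ffmpeg):
import os

seg_dir = './dianying'
segments = sorted((name for name in os.listdir(seg_dir) if name.endswith('.ts')),
                  key=lambda name: int(name.split('.')[0]))
with open(os.path.join(seg_dir, 'wanmei_full.ts'), 'wb') as out:
    for seg in segments:
        with open(os.path.join(seg_dir, seg), 'rb') as part:
            out.write(part.read())
print('merged', len(segments), 'segments')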