python(爬取页面数据)
软件安装方式
pip install requests lxml
1.爬取指定页面的相关信息
import requests #页面地址请求模块
from lxml import etree #检索HTML数据信息
# Scrape the player statistics table from nba.hupu.com and save one line per
# player to nba.txt.
url = "https://nba.hupu.com/stats/players"  # page to scrape
# Browser-like User-Agent request header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko"}
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()
e = etree.HTML(resp.text)  # parse the page source into an element tree

# XPath queries that pull each column out of the players table.
nos = e.xpath('//table[@class="players_table"]//tr/td[@width="46"]/text()')
names = e.xpath('//table[@class="players_table"]//tr/td[2]/a/text()')
teams = e.xpath('//table[@class="players_table"]//tr/td[3]/a/text()')
scores = e.xpath('//table[@class="players_table"]//tr/td[@class="bg_b"]/text()')
print(teams)

with open('nba.txt', 'w', encoding='utf-8') as obj:
    # nos[1:] skips the first matched cell -- presumably the header row's
    # rank cell; verify against the live page.
    for no, name, team, score in zip(nos[1:], names, teams, scores):
        line = f'排名:{no} 姓名:{name} 球队:{team} 得分:{score}'
        print(line)
        obj.write(line + '\n')  # persist the same line that was printed
2.爬取页面图片
from traceback import print_tb
from lxml import etree
import requests
# Download the skin images of every hero listed on pvp.qq.com into the
# current directory, one "<skin name>.jpg" file per skin.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37"}
# JSON file describing all heroes (visible in the browser dev tools under
# Network -> XHR -> herolist.json).
hero_list_url = "https://pvp.qq.com/web201605/js/herolist.json"
hero_list_resp = requests.get(hero_list_url, headers=header, timeout=10)
hero_list_resp.raise_for_status()
# Parse the JSON body once and reuse it for both printing and iterating.
hero_list = hero_list_resp.json()
print(hero_list)
for h in hero_list:
    ename = h.get('ename')
    # The f-string substitutes the hero's ename into the detail page URL.
    hero_url = f'https://pvp.qq.com/web201605/herodetail/{ename}.shtml'
    hero_resp = requests.get(hero_url, headers=header, timeout=10)
    # Decode the page as GBK; otherwise the Chinese text comes out garbled.
    hero_resp.encoding = 'gbk'
    e = etree.HTML(hero_resp.text)
    skin_attrs = e.xpath('//ul[@class="pic-pf-list pic-pf-list3"]/@data-imgname')
    if not skin_attrs:
        # No skin list on this page; skip instead of crashing with IndexError.
        continue
    hero_names = skin_attrs[0]
    # Skin entries are separated by '|'; keep only the part before '&'.
    # split() also copes with an entry that has no '&' at all.
    names = [name.split('&')[0] for name in hero_names.split('|')]
    # i is the zero-based position of the skin; the image URLs count from 1.
    for i, n in enumerate(names):
        url = f'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{ename}/{ename}-bigskin-{i+1}.jpg'
        resp = requests.get(url, headers=header, timeout=10)
        if not resp.ok:
            # Do not save an HTTP error page under a .jpg name.
            continue
        with open(f'{n}.jpg', 'wb') as obj:
            obj.write(resp.content)
3.页面爬取数据并生成Flask页面
example one:
# app.py
# Minimal Flask application that serves a "random draw" page.
# Install Flask first: pip install Flask
from random import randint

from flask import Flask
from flask import render_template

# Create the Flask application object; once running, the pages are served
# locally (127.0.0.1:5000).
app = Flask(__name__)
hero = ['黑暗之女', '狂战士', '正义巨像']


@app.route('/index')  # map the /index URL to the index() view below
def index():
    """Render testcc.html with the full hero list and no drawn hero."""
    # By default render_template looks for testcc.html inside the
    # "templates" directory next to this file.
    return render_template('testcc.html', hero=hero)


@app.route('/chouqian')
def chouqian():
    """Draw one hero at random and render testcc.html with the result."""
    num = randint(0, len(hero) - 1)  # random valid index into hero
    return render_template('testcc.html', hero=hero, h=hero[num])


# Run the app; debug=True reloads automatically on code changes, which is
# convenient during development.
app.run(debug=True)
#testcc.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    {{ hero }}
    <!-- Clicking "random draw" requests /chouqian, which calls the chouqian function. -->
    <a href="/chouqian">random draw</a><br/>
    您抽到的是:{{ h }}
</body>
</html>
example two
#记录发送请求的地址
import requests
#解析数据模块
from lxml import etree
from flask import Flask,render_template,request
# Create the Flask application object that serves the web routes.
app = Flask(__name__)
def get_mobile(phone):
    """Look up a mobile number on ip138.com and return the scraped fields.

    Args:
        phone: The mobile number to look up.

    Returns:
        A list of the text values matched by the XPath
        ``//tr/td[2]/span/text()`` on the result page (empty if nothing
        matched).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://www.ip138.com/mobile.asp'
    # Let requests build and percent-encode the query string so that an
    # unusual ``phone`` value cannot inject extra query parameters.
    params = {'mobile': phone, 'action': 'mobile'}
    # Present a browser-like User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko'}
    # A timeout keeps the calling web request from hanging indefinitely.
    num = requests.get(url, params=params, headers=headers, timeout=10)
    # Decode the body as UTF-8 so the Chinese text is readable.
    num.encoding = 'utf-8'
    # Parse the HTML into an element tree and extract the result cells.
    e = etree.HTML(num.text)
    datas = e.xpath('//tr/td[2]/span/text()')
    return datas
#get_mobile(18934341244)
# Route for the site root.
@app.route('/')
def index():
    """Render and return the index1.html template."""
    page = render_template('index1.html')
    return page
@app.route('/search_phone')
def search_phone():
    """Look up the ``phone`` query parameter and return the result as HTML.

    Returns:
        The fields returned by ``get_mobile`` joined with ``<br/>``, or a
        400 response when ``phone`` is missing or not made of digits.
    """
    # Local import so this view does not depend on the file's import block.
    from html import escape

    # Read the "phone" value submitted by the form in index1.html.
    phone = request.args.get('phone', '').strip()
    if not phone.isdigit():
        # Reject missing or malformed input before calling the remote site.
        return 'Please provide a phone number consisting of digits only.', 400
    data = get_mobile(phone)
    # get_mobile returns a list of strings, which cannot be returned as-is;
    # join the items with line breaks. The text comes from a third-party
    # page, so escape it before embedding it in the HTML response.
    return '<br/>'.join(escape(item) for item in data)
# Start the app's web server with debug mode enabled.
app.run(debug=True)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <!-- Text input for the phone number; it is passed as the "phone" parameter when the form is submitted. -->
    <!-- The form's action sends the query to /search_phone. -->
    <form action="/search_phone" method="get">
        <label for="phone">手机号:</label>
        <input type="text" name="phone" id="phone">
        <input type="submit" value="search">
    </form>
</body>
</html>
浙公网安备 33010602011771号