python(爬取页面数据)

软件安装方式

pip install requests  lxml

1.爬取指定页面的相关信息

import requests                                 #页面地址请求模块
from lxml import etree                          #检索HTML数据信息
url = "https://nba.hupu.com/stats/players"  # page that lists NBA player statistics

# Send a browser-like User-Agent with the request (this is not a security key;
# it only identifies the client to the server).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko"}
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page into an empty file.
resp.raise_for_status()
e = etree.HTML(resp.text)  # parse the page source into an element tree

# One XPath query per column of the "players_table" table.
nos = e.xpath('//table[@class="players_table"]//tr/td[@width="46"]/text()')
names = e.xpath('//table[@class="players_table"]//tr/td[2]/a/text()')
teams = e.xpath('//table[@class="players_table"]//tr/td[3]/a/text()')
scores = e.xpath('//table[@class="players_table"]//tr/td[@class="bg_b"]/text()')

print(teams)
with open('nba.txt', 'w', encoding='utf-8') as obj:
    # nos[1:] drops the first matched cell -- presumably the table header;
    # verify against the live page. zip() stops at the shortest column.
    for no, name, team, score in zip(nos[1:], names, teams, scores):
        # Build the line once and reuse it for both the console and the file.
        line = f'排名:{no} 姓名:{name} 球队:{team} 得分:{score}'
        print(line)
        obj.write(line + '\n')

2.爬取页面图片

from traceback import print_tb
from lxml import etree
import requests
# Browser-like User-Agent sent with every request in this script.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37"}

# JSON file listing the heroes (found in the browser dev tools under
# Network -> XHR -> herolist.json).
hero_list_url = "https://pvp.qq.com/web201605/js/herolist.json"
hero_list_resp = requests.get(hero_list_url, headers=header, timeout=10)
hero_list_resp.raise_for_status()
# Decode the JSON body once and reuse it for both printing and iteration.
hero_list = hero_list_resp.json()
print(hero_list)

for h in hero_list:
    ename = h.get('ename')
    # The f-string substitutes the hero's ename into the detail-page URL.
    hero_url = f'https://pvp.qq.com/web201605/herodetail/{ename}.shtml'
    hero_resp = requests.get(hero_url, headers=header, timeout=10)
    # Decode the page as GBK; otherwise the text comes out garbled.
    hero_resp.encoding = 'gbk'
    e = etree.HTML(hero_resp.text)
    skin_attr = e.xpath('//ul[@class="pic-pf-list pic-pf-list3"]/@data-imgname')
    if not skin_attr:
        # No skin list on this page: skip the hero instead of raising IndexError.
        continue
    hero_names = skin_attr[0]

    # Entries are separated by '|'; keep the part of each entry before '&'.
    # partition() returns the whole entry when it contains no '&'.
    names = [name.partition('&')[0] for name in hero_names.split('|')]
    # i is the zero-based position of the skin; image files are numbered from 1.
    for i, n in enumerate(names):
        url = f'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{ename}/{ename}-bigskin-{i+1}.jpg'
        resp = requests.get(url, headers=header, timeout=10)
        if not resp.ok:
            # Do not save an HTTP error body under a .jpg name.
            continue
        with open(f'{n}.jpg', 'wb') as obj:
            obj.write(resp.content)

3.页面爬取数据并生成Flask页面

example one:

#app.py
# Minimal Flask web application (install the dependency with: pip install Flask).
from flask import Flask
from flask import render_template
from random import randint

# Create the Flask application object; the development server started at the
# bottom of this script is reachable locally at 127.0.0.1:5000.
app = Flask(__name__)
hero = ['黑暗之女', '狂战士', '正义巨像']


@app.route('/index')  # map the /index URL to the index() view below
def index():
    """Render testcc.html with the hero list.

    render_template() looks the file up in the ``templates`` directory next to
    this script, so testcc.html must be placed there.
    """
    return render_template('testcc.html', hero=hero)


@app.route('/chouqian')
def chouqian():
    """Pick one hero at random and render testcc.html with it as ``h``."""
    num = randint(0, len(hero) - 1)  # random valid index into hero
    return render_template('testcc.html', hero=hero, h=hero[num])


# Start the development server; debug=True reloads the app automatically when
# the code changes, which is convenient during development.
app.run(debug=True)
#testcc.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    {{ hero }}
    {# Clicking "random draw" requests /chouqian, which is served by the chouqian view. #}
    <a href="/chouqian">random draw</a><br/>
    {# Show the result line only when a hero has been drawn (h is passed by /chouqian). #}
    {% if h %}
    您抽到的是:{{ h }}
    {% endif %}
</body>
</html>

example two

#记录发送请求的地址
import requests
#解析数据模块
from lxml import etree
from flask import Flask,render_template,request
# Create the Flask application object that the routes in this script are registered on.
app = Flask(__name__)

def get_mobile(phone):
    """Look up a mobile number on ip138.com and return the scraped result cells.

    Args:
        phone: The number to look up; sent as the ``mobile`` query parameter.

    Returns:
        The list of text nodes matched by ``//tr/td[2]/span/text()`` on the
        result page (empty when nothing matches).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://www.ip138.com/mobile.asp'
    # Let requests build the query string so that phone is URL-encoded and
    # cannot inject extra query parameters.
    params = {'mobile': phone, 'action': 'mobile'}
    # Present a browser-like User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko'}
    # A timeout keeps a stalled upstream site from blocking the web request forever.
    num = requests.get(url, params=params, headers=headers, timeout=10)
    # Decode the body as UTF-8 so the Chinese text is readable.
    num.encoding = 'utf-8'
    # Parse the page source into an element tree.
    e = etree.HTML(num.text)
    # Extract the text of the <span> in the second cell of every table row.
    datas = e.xpath('//tr/td[2]/span/text()')
    return datas
#get_mobile(18934341244)

# Route registration: the site root serves the search form.
@app.route('/')
def index():
    """Render and return the ``index1.html`` template."""
    page = render_template('index1.html')
    return page

@app.route('/search_phone')
def search_phone():
    """Handle the search form: look up the submitted number and show the result.

    Reads the ``phone`` query parameter sent by the form in index1.html.

    Returns:
        The looked-up fields joined with ``<br/>``, or a 400 response when no
        phone number was supplied.
    """
    from html import escape  # local import keeps this block self-contained

    phone = request.args.get('phone')
    if not phone:
        # Without this guard a missing parameter would look up the literal "None".
        return 'missing phone parameter', 400
    data = get_mobile(phone)
    # data is a list such as ['广东\xa0深圳市', '中国电信天翼卡', '0755', '518000'];
    # a list cannot be returned directly, so join it into one HTML string.
    # The text comes from a third-party page, so escape it before embedding it in HTML.
    return '<br/>'.join(escape(item) for item in data)
    
# Start the app's web server.
# NOTE(review): debug=True is meant for development; confirm it is turned off before deploying.
app.run(debug=True)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <!-- The text input sends the number as the "phone" parameter; submit runs the query. -->
    <!-- The form's action is the /search_phone URL, which is served by the search_phone view. -->
    <form action="/search_phone" method="get">
        <label for="phone">手机号: </label><input type="text" name="phone" id="phone">
        <input type="submit" value="search">
    </form>
</body>
</html>

 

posted @ 2022-10-13 15:40  JASON_yul  阅读(620)  评论(0)    收藏  举报