爬取12306

站点转换

查看12306网站的HTML可以发现,站点都是用代码代替的,如:北京的代码为“BJP”。而我们在实际输入的时候不可能再去查询站点的代码,所以需要将中文站点名转换为对应的代码。

经查询,12306提供了一个保存站点名称与代码对应关系的JS文件(station_name.js),我们只需要抓取这个文件的内容,分析整理出对应规则即可。

import requests,re

# Address of the 12306 JavaScript file that lists station names and telecodes.
url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9053"
# Send a browser-like User-Agent with the request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
# A timeout keeps the script from hanging forever when 12306 does not answer.
response = requests.get(url,headers=headers,timeout=10)

经过比较,我们可以使用正则表达式进行分类,然后找出对应的关系

# Greedy pattern: captures everything between the first and the last single
# quote found on a line of the downloaded text.
station_re = re.compile(r"'(.*)'")
station = station_re.findall(response.text)
# Glue all captures together, then cut the result into individual "|"-separated fields.
l = "".join(station).split("|")
#print(l)

# Chinese station names: every 5th field starting at index 1
name = l[1::5]

# Station telecodes: every 5th field starting at index 2
code = l[2::5]

为了方便使用,将其放入字典中

# Lookup dictionaries in both directions
def to_dic():
    """Build the two station lookup tables from the module-level lists.

    Returns:
        A tuple (name_code, code_name): the first dict maps each entry of
        ``name`` to the entry of ``code`` at the same position, the second
        dict is the reverse mapping.
    """
    # zip pairs the lists position by position and stops at the shorter one,
    # so a truncated last record cannot raise IndexError.
    name_code = dict(zip(name, code))
    code_name = dict(zip(code, name))
    return name_code,code_name

完整代码

import requests,re

# Address of the 12306 JavaScript file that lists station names and telecodes.
url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9053"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
# This request runs whenever the module is loaded; the timeout keeps that
# from hanging forever when 12306 does not answer.
response = requests.get(url,headers=headers,timeout=10)

# Greedy pattern: captures everything between the first and the last single
# quote found on a line of the downloaded text.
station_re = re.compile(r"'(.*)'")
station = station_re.findall(response.text)
# Glue all captures together, then cut the result into "|"-separated fields.
l = "".join(station).split("|")

# Chinese station names: every 5th field starting at index 1
name = l[1::5]

# Station telecodes: every 5th field starting at index 2
code = l[2::5]

# Lookup dictionaries in both directions
def to_dic():
    """Build the two station lookup tables from the module-level lists.

    Returns:
        A tuple (name_code, code_name): the first dict maps each entry of
        ``name`` to the entry of ``code`` at the same position, the second
        dict is the reverse mapping.
    """
    # zip pairs the lists position by position and stops at the shorter one,
    # so a truncated last record cannot raise IndexError.
    name_code = dict(zip(name, code))
    code_name = dict(zip(code, name))
    return name_code,code_name


def search(a):
    """Print the counterpart of station *a* (name -> telecode or telecode -> name).

    The name table is tried first, then the telecode table; a notice is
    printed when neither contains *a*. Always returns None.
    """
    for table in to_dic():
        if a in table:
            print(table[a])
            return None
    print("不存在此站点")
    return None


if __name__ == "__main__":
    # Interactive lookup: keep asking until the user types one of the stop words.
    print("结束时请输入“结束”或“over”")
    while True:
        a = input("请输入要查询的站点:")
        if a in ("结束", "over"):
            break
        search(a)

将其打包成一个py文件(station_name_code.py),方便后面爬取12306时调取。

爬取12306

爬取信息

分析网页可以看出,数据都被放在了json文件中

为避免错误,最好把浏览器中的User-Agent和cookie等一同传入

import requests
import json

def getDatas(year,month,date,_from,_to):
    """Query 12306 for the trains between two stations on one day.

    Args:
        year, month, date: Parts of the travel date. Each is converted with
            str(); month and date are zero-padded to two digits so that
            both 2 and "02" produce "02".
        _from, _to: Departure and arrival station telecodes (e.g. "BJP"),
            inserted into the query URL as given.

    Returns:
        The value stored under ["data"]["result"] in the JSON response
        (one "|"-separated string per train, as consumed by _re) when the
        HTTP status is 200; otherwise None, after printing a notice.

    Raises:
        Whatever requests.get raises on connection problems or timeout, and
        whatever json.loads / the key lookups raise when the body is not the
        expected JSON document.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session value;
    # replace it with the cookie of your own browser session.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'kyfw.12306.cn',
        'Referer': 'https://www.12306.cn/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        "Cookie": "_uab_collina=161396635873945822102574; JSESSIONID=631FB91746707366DCBBA5BE2C7453A1; BIGipServerotn=1106248202.24610.0000; RAIL_EXPIRATION=1614230065687; RAIL_DEVICEID=io7zYXeM-Gz8YUk5ZnOn0GjmqcUoFTj4kauIJForuflzNbyVp0X8kMyAxmqJYYJAhD8NCH3XBPNqdP3i225ICWhEvFl8Rr854pgCzjRbeBru9zok8pfJayROxlXHA4KMD4zV5AlBd8SqAbVVZbCvtviysCtLN_CH; BIGipServerpool_passport=165937674.50215.0000; route=c5c62a339e7744272a54643b3be5bf64; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_fromDate=2021-02-22; _jc_save_toDate=2021-02-22; _jc_save_wfdc_flag=dc",
    }
    urls = "https://kyfw.12306.cn/otn/leftTicket/queryZ?leftTicketDTO.train_date={}-{}-{}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes=ADULT".format(
        str(year),str(month).zfill(2),str(date).zfill(2),_from,_to)
    # A timeout keeps the program from hanging forever when the server does not answer.
    res = requests.get(urls,headers=headers,timeout=10)
    if res.status_code != 200:
        print('网页未响应')
        return None
    payload = json.loads(res.text)
    # All ticket records of the result page (a list of strings).
    return payload["data"]["result"]

筛选信息

根据对比筛选出所需要的信息,并归类整理,便于使用。

#对单条信息删选,找出需要的信息
def _re(data):
    message = data.split("|")    #经过拆分可看出“预定”都是在第二个(索引为1)
    #print(message)
    return message    #单条火车信息组成的列表

# Pick and label the fields that are shown to the user
def prt(mes):
    """Turn one split train record into a flat label/value list.

    Args:
        mes: Field list of one record (the result of _re); indices up to 32
            are read.

    Returns:
        A list of 28 items alternating label and value
        ([label, value, label, value, ...]). Empty-string values are
        replaced with "无".
    """
    # code() returns None for a telecode it does not know; fall back to the
    # raw telecode so later string formatting does not fail on None.
    departure = code(mes[4]) or mes[4]     # departure station, as a Chinese name
    arrival = code(mes[7]) or mes[7]       # arrival station, as a Chinese name
    # NOTE(review): the leading space in " 到达时间:" is kept from the original; it looks unintentional.
    fields = [
        ("车次:", mes[3]),          # train number
        ("出发站:", departure),
        ("到达站:", arrival),
        ("出发时间:", mes[8]),      # departure time
        (" 到达时间:", mes[9]),     # arrival time
        ("历时:", mes[10]),         # duration
        ("日期:", mes[13]),         # date
        ("商务座:", mes[32]),       # business (premier) class
        ("一等座:", mes[31]),       # first class
        ("二等座:", mes[30]),       # second class
        ("无座:", mes[26]),         # standing
        ("软卧:", mes[23]),         # soft sleeper
        ("硬卧:", mes[28]),         # hard sleeper
        ("硬座:", mes[29]),         # hard seat
    ]
    list0 = []
    for label, value in fields:
        list0.append(label)
        list0.append("无" if value == "" else value)
    return list0

此时,所需要的车次,余票等信息都已找出。

保存

将爬取的信息保存到本地(可保存为txt、excel、json或数据库等)

### 保存为txt
# Convert the label/value list into a string that can be printed or saved
def txt_format(l):
    """Join a label/value list into one aligned line of text.

    Args:
        l: Flat list alternating label and value (as returned by prt).
            The list is left unmodified.

    Returns:
        One string in which every item at an odd index is left-aligned to
        width 5, padded with U+3000 and followed by a tab, while items at
        even indices are copied unchanged.
    """
    # U+3000 is the full-width space; padding with it keeps CJK text aligned.
    fill = chr(12288)
    parts = [
        "{0:{1}<5}\t".format(item, fill) if index % 2 else item
        for index, item in enumerate(l)
    ]
    return "".join(parts)

def save_txt(m):
    """Append *m* followed by a blank line to "火车票.txt".

    The file lives in the directory named by the module-level ``path``
    (which must already end with a path separator).

    Args:
        m: The text line to append.
    """
    # newline="" switches off newline translation, so "\r\n" is written as
    # exactly two bytes on every platform (text mode on Windows would
    # otherwise turn it into "\r\r\n"). UTF-8 is set explicitly so the
    # Chinese text does not depend on the system locale.
    with open(path+"火车票.txt","a",encoding="utf-8",newline="") as f:
        f.write(m)
        f.write('\r\n')
        f.write('\r\n')
        
def go_txt(year,month,date,_from,_to):
    """Fetch the trains for one query and append each one to the txt file.

    Args:
        year, month, date, _from, _to: Passed unchanged to getDatas.
    """
    datas = getDatas(year,month,date,_from,_to)
    if datas is None:
        # getDatas returns None (after printing a notice) when the request
        # failed; iterating over None would raise TypeError.
        return

    for data in datas:
        mes = _re(data)             # split the raw record into fields
        prt_txt = prt(mes)          # select and label the needed fields
        save_txt(txt_format(prt_txt))
### 保存为excel
import openpyxl
import os

# Save to excel
def save_excel(l,i):
    """Write one train record into row *i* of "火车票.xlsx".

    The workbook lives in the directory named by the module-level ``path``.
    It is created with a sheet titled "火车票信息" when the file does not
    exist yet, and it is loaded and saved again on every call.

    Args:
        l: Flat label/value list of 28 items (as returned by prt).
        i: Row number the values are written to.
    """
    workbook_path = path+"火车票.xlsx"
    if not os.path.exists(workbook_path):       # create the file on first use
        wb = openpyxl.Workbook()
        wb.active.title = "火车票信息"
        wb.save(workbook_path)
    wb = openpyxl.load_workbook(workbook_path)
    sheet = wb["火车票信息"]
    columns = "ABCDEFGHIJKLMN"      # one column per label/value pair
    if sheet["A1"].value is None:
        # Header row: the labels (even indices) with the colon removed.
        for pair, column in enumerate(columns):
            sheet[column+"1"] = l[2*pair].replace(":","")
    row = str(i)
    # Data row: the values (odd indices).
    for pair, column in enumerate(columns):
        sheet[column+row] = l[2*pair+1]
    wb.save(workbook_path)

def go_excel(year,month,date,_from,_to):
    """Fetch the trains for one query and write each one to the excel file.

    Records are written to rows 2, 4, 6, ... (row 1 holds the header and
    every second row is left empty, as before).

    Args:
        year, month, date, _from, _to: Passed unchanged to getDatas.
    """
    datas = getDatas(year,month,date,_from,_to)
    if datas is None:
        # getDatas returns None (after printing a notice) when the request
        # failed; iterating over None would raise TypeError.
        return
    for index, data in enumerate(datas):
        prt_txt = prt(_re(data))
        save_excel(prt_txt, 2 + 2 * index)

其它

  1. 汉字编码转换

    我们在终端输入的是中文,而网站需要的是站点代码,所以需要进行转换。就用到了开篇的"站点转换"。

    # 导入站点转换的py文件
    import station_name_code
    
    # Translate between Chinese station names and telecodes
    def code(name):
        """Return the counterpart of *name* (name -> telecode or telecode -> name).

        The name table is tried first, then the telecode table. When
        neither contains *name*, a notice is printed and None is returned.
        """
        # Build the two tables once; every to_dic() call rebuilds both dicts.
        name_code, code_name = station_name_code.to_dic()
        if name in name_code:
            return name_code[name]
        if name in code_name:
            return code_name[name]
        print("站点不存在")
        return None
    
  2. 选择路径

    对于不同用户,存放文件的路径的需求是不同的,所以额外设置一个可以选择路径的功能。

    import tkinter as tk
    from tkinter import filedialog
    
    def select_path():
        """Ask the user for a folder and return it with a trailing "/".

        Returns:
            The chosen directory followed by "/", or "./" (the current
            directory) when the dialog is closed without a selection.
        """
        root = tk.Tk()
        root.withdraw()     # hide the empty main window; only the dialog is shown
        try:
            folderpath = filedialog.askdirectory()
        finally:
            root.destroy()  # release the hidden Tk root once the dialog is closed

        if not folderpath:
            # Without this guard a cancelled dialog would give "" + "/",
            # i.e. the filesystem root.
            return "./"
        return folderpath + '/'
    

整体代码

#!-*-coding:utf-8 -*-
# python3.7
# @Author:fuq666@qq.com
# Update time:2020-09-10
# Filename:爬取火车票信息

"""
实现功能:选择时间和始终站(中文),可查出该日期的所有车次及余票,结果可返回为txt或excel文件。

待改进:1.强制输入站点为中文,或站点代码
        2.可选时间的限制,相对于当前时间(一个月后则无效?)
        3.增加购票系统?
        4.变为GUI桌面程序
"""

import requests,openpyxl
import os,json,re,time
import station_name_code
import tkinter as tk
from tkinter import filedialog

def getDatas(year,month,date,_from,_to):
    """Query 12306 for the trains between two stations on one day.

    Args:
        year, month, date: Parts of the travel date. Each is converted with
            str(); month and date are zero-padded to two digits so that
            both 2 and "02" produce "02".
        _from, _to: Departure and arrival station telecodes (e.g. "BJP") as
            strings, concatenated into the query URL.

    Returns:
        The value stored under ["data"]["result"] in the JSON response
        (one "|"-separated string per train, as consumed by _re) when the
        HTTP status is 200; otherwise None, after printing a notice.

    Raises:
        TypeError when _from or _to is not a string, whatever requests.get
        raises on connection problems or timeout, and whatever json.loads /
        the key lookups raise when the body is not the expected JSON
        document.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session value;
    # replace it with the cookie of your own browser session.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'kyfw.12306.cn',
        'Referer': 'https://www.12306.cn/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        "Cookie": "_uab_collina=161396635873945822102574; JSESSIONID=631FB91746707366DCBBA5BE2C7453A1; BIGipServerotn=1106248202.24610.0000; RAIL_EXPIRATION=1614230065687; RAIL_DEVICEID=io7zYXeM-Gz8YUk5ZnOn0GjmqcUoFTj4kauIJForuflzNbyVp0X8kMyAxmqJYYJAhD8NCH3XBPNqdP3i225ICWhEvFl8Rr854pgCzjRbeBru9zok8pfJayROxlXHA4KMD4zV5AlBd8SqAbVVZbCvtviysCtLN_CH; BIGipServerpool_passport=165937674.50215.0000; route=c5c62a339e7744272a54643b3be5bf64; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_fromDate=2021-02-22; _jc_save_toDate=2021-02-22; _jc_save_wfdc_flag=dc",
    }
    urls = (
        "https://kyfw.12306.cn/otn/leftTicket/queryZ?leftTicketDTO.train_date="
        + str(year) + "-" + str(month).zfill(2) + "-" + str(date).zfill(2)
        + "&leftTicketDTO.from_station=" + _from
        + "&leftTicketDTO.to_station=" + _to
        + "&purpose_codes=ADULT"
    )
    # A timeout keeps the program from hanging forever when the server does not answer.
    res = requests.get(urls,headers=headers,timeout=10)
    if res.status_code != 200:
        print('网页未响应')
        return None
    payload = json.loads(res.text)
    # All ticket records of the result page (a list of strings).
    return payload["data"]["result"]

#对单条信息删选,找出需要的信息
def _re(data):
    '删选每条信息'
    message = data.split("|")    #经过拆分可看出“预定”都是在第二个(索引为1)
    #print(message)
    return message    #单条火车信息组成的列表

# Pick and label the fields that are shown to the user
def prt(mes):
    """Turn one split train record into a flat label/value list.

    Args:
        mes: Field list of one record (the result of _re); indices up to 32
            are read.

    Returns:
        A list of 28 items alternating label and value
        ([label, value, label, value, ...]). Empty-string values are
        replaced with "无".
    """
    # code() returns None for a telecode it does not know; fall back to the
    # raw telecode so later string formatting does not fail on None.
    departure = code(mes[4]) or mes[4]     # departure station, as a Chinese name
    arrival = code(mes[7]) or mes[7]       # arrival station, as a Chinese name
    # NOTE(review): the leading space in " 到达时间:" is kept from the original; it looks unintentional.
    fields = [
        ("车次:", mes[3]),          # train number
        ("出发站:", departure),
        ("到达站:", arrival),
        ("出发时间:", mes[8]),      # departure time
        (" 到达时间:", mes[9]),     # arrival time
        ("历时:", mes[10]),         # duration
        ("日期:", mes[13]),         # date
        ("商务座:", mes[32]),       # business (premier) class
        ("一等座:", mes[31]),       # first class
        ("二等座:", mes[30]),       # second class
        ("无座:", mes[26]),         # standing
        ("软卧:", mes[23]),         # soft sleeper
        ("硬卧:", mes[28]),         # hard sleeper
        ("硬座:", mes[29]),         # hard seat
    ]
    list0 = []
    for label, value in fields:
        list0.append(label)
        list0.append("无" if value == "" else value)
    return list0

# Convert the label/value list into a string that can be printed or saved
def txt_format(l):
    """Join a label/value list into one aligned line of text.

    Args:
        l: Flat list alternating label and value (as returned by prt).
            The list is left unmodified.

    Returns:
        One string in which every item at an odd index is left-aligned to
        width 5, padded with U+3000 and followed by a tab, while items at
        even indices are copied unchanged.
    """
    # U+3000 is the full-width space; padding with it keeps CJK text aligned.
    fill = chr(12288)
    parts = [
        "{0:{1}<5}\t".format(item, fill) if index % 2 else item
        for index, item in enumerate(l)
    ]
    return "".join(parts)

# Save to txt
def save_txt(m):
    """Append *m* followed by a blank line to "火车票.txt".

    The file lives in the directory named by the module-level ``path``
    (which must already end with a path separator).

    Args:
        m: The text line to append.
    """
    # newline="" switches off newline translation, so "\r\n" is written as
    # exactly two bytes on every platform (text mode on Windows would
    # otherwise turn it into "\r\r\n"). UTF-8 is set explicitly so the
    # Chinese text does not depend on the system locale.
    with open(path+"火车票.txt","a",encoding="utf-8",newline="") as f:
        f.write(m)
        f.write('\r\n')
        f.write('\r\n')

# Save to excel
def save_excel(l,i):
    """Write one train record into row *i* of "火车票.xlsx".

    The workbook lives in the directory named by the module-level ``path``.
    It is created with a sheet titled "火车票信息" when the file does not
    exist yet, and it is loaded and saved again on every call.

    Args:
        l: Flat label/value list of 28 items (as returned by prt).
        i: Row number the values are written to.
    """
    workbook_path = path+"火车票.xlsx"
    if not os.path.exists(workbook_path):       # create the file on first use
        wb = openpyxl.Workbook()
        wb.active.title = "火车票信息"
        wb.save(workbook_path)
    wb = openpyxl.load_workbook(workbook_path)
    sheet = wb["火车票信息"]
    columns = "ABCDEFGHIJKLMN"      # one column per label/value pair
    if sheet["A1"].value is None:
        # Header row: the labels (even indices) with the colon removed.
        for pair, column in enumerate(columns):
            sheet[column+"1"] = l[2*pair].replace(":","")
    row = str(i)
    # Data row: the values (odd indices).
    for pair, column in enumerate(columns):
        sheet[column+row] = l[2*pair+1]
    wb.save(workbook_path)

## 运行顺序
def go_txt(year,month,date,_from,_to):
    """Fetch the trains for one query, print each one and append it to the txt file.

    Args:
        year, month, date, _from, _to: Passed unchanged to getDatas.
    """
    datas = getDatas(year,month,date,_from,_to)
    if datas is None:
        # getDatas returns None (after printing a notice) when the request
        # failed; iterating over None would raise TypeError.
        return

    for data in datas:
        mes = _re(data)             # split the raw record into fields
        prt_txt = prt(mes)          # select and label the needed fields
        m = txt_format(prt_txt)
        print(m,"\n")               # echo to the screen
        save_txt(m)                 # append to the txt file

def go_excel(year,month,date,_from,_to):
    """Fetch the trains for one query and write each one to the excel file.

    Records are written to rows 2, 4, 6, ... (row 1 holds the header and
    every second row is left empty, as before).

    Args:
        year, month, date, _from, _to: Passed unchanged to getDatas.
    """
    datas = getDatas(year,month,date,_from,_to)
    if datas is None:
        # getDatas returns None (after printing a notice) when the request
        # failed; iterating over None would raise TypeError.
        return
    for index, data in enumerate(datas):
        prt_txt = prt(_re(data))
        save_excel(prt_txt, 2 + 2 * index)

# Translate between Chinese station names and telecodes
def code(name):
    """Return the counterpart of *name* (name -> telecode or telecode -> name).

    The name table is tried first, then the telecode table. When neither
    contains *name*, a notice is printed and None is returned implicitly.
    """
    tables = station_name_code.to_dic()
    for table in (tables[0], tables[1]):
        if name in table:
            return table[name]
    print("站点不存在")

# Choose where the result files are saved
def select_path():
    """Ask the user for a folder and return it with a trailing "/".

    Returns:
        The chosen directory followed by "/", or "./" (the current
        directory) when the dialog is closed without a selection.
    """
    import tkinter as tk
    from tkinter import filedialog

    root = tk.Tk()
    root.withdraw()     # hide the empty main window; only the dialog is shown
    try:
        folderpath = filedialog.askdirectory()
    finally:
        root.destroy()  # release the hidden Tk root once the dialog is closed

    if not folderpath:
        # Without this guard a cancelled dialog would give "" + "/",
        # i.e. the filesystem root.
        return "./"
    return folderpath + '/'

# Main program
def _station_code(station):
    """Return the telecode for *station*, given a Chinese name or a telecode."""
    name_code, code_name = station_name_code.to_dic()
    if station in name_code:
        return name_code[station]
    if station in code_name:
        # Already a telecode: keep it. code() would translate it back to the
        # Chinese name, which the query URL cannot use.
        return station
    print("站点不存在")
    return None


def main():
    """Read the query from the keyboard and export the matching trains.

    Asks for the date, both stations and the output choice (0 = screen and
    txt file, any other integer = excel file), then runs go_txt or go_excel.
    Nothing is exported when a station is unknown.
    """
    year=input("输入年份(如:2020):")
    month=input("输入月份(如:01):")
    date=input("输入日期(如:01):")
    _from=input("输入出发站(如:北京):")
    _to=input("输入到达站(如:武汉):")
    choice = None
    while choice is None:
        try:
            choice=int(input("返回至屏幕和txt文件输入“0”,返回至excel文件输入“1”:"))
        except ValueError:
            # Not a number: ask again instead of crashing.
            continue

    _from = _station_code(_from)     # convert to telecodes
    _to = _station_code(_to)
    if _from is None or _to is None:
        # The missing station has already been reported; there is nothing to query.
        return

    export = go_txt if choice == 0 else go_excel
    try:
        export(year,month,date,_from,_to)
    except Exception:
        # Top-level boundary: report the failure without a traceback, but let
        # KeyboardInterrupt / SystemExit through (a bare except would not).
        print("未找到")

# Script entry point.
if __name__ == "__main__":
    print('请选择结果保存路径:')
    time.sleep(0.5)
    # `path` becomes a module-level global here; save_txt and save_excel read
    # it, so it has to be assigned before main() runs.
    path = select_path()
    # path = r'C:/Users/15394/Desktop/'
    main()
    print('保存在{}'.format(path))
    print("Done")

  • 根据自己的浏览器信息更改cookie数据
posted @ 2021-02-22 14:25  F___Q  阅读(310)  评论(0)    收藏  举报