import csv

import requests
from lxml import etree
def city_page(base_url):
    """Crawl the nationwide index at <base_url>post/ and scrape every linked city.

    For each province/city link found in the table with id='quanguo',
    calls post_code() with the city's absolute URL and its display name.

    Parameters:
        base_url: site root ending with '/', e.g. 'https://www.ip138.com/'.
    """
    url = base_url + 'post/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.get(url, headers=headers, timeout=10)
    # The index page is GBK-encoded. Setting the response encoding once
    # replaces the original per-string .encode('ISO-8859-1').decode('gbk')
    # round-trip hack used to undo requests' latin-1 fallback decoding.
    response.encoding = 'gbk'
    page_data = etree.HTML(response.text)
    rows = page_data.xpath("//table[@id='quanguo']//tr")
    for row in rows:
        for cell in row.xpath(".//td"):
            href_num = cell.xpath("./a/@href")
            href_name = cell.xpath("./a/text()")
            # Some trailing cells are empty (no <a>); require BOTH the href
            # and the link text so we never pair an href with a stale name.
            if href_num and href_name:
                dirname = href_name[0]
                href_url = base_url + href_num[0]
                post_code(href_url, dirname)
def post_code(base_url, dirname):
    """Scrape one city's postal-code table and append its rows to a CSV file.

    Parameters:
        base_url: absolute URL of the city page, e.g. 'https://www.ip138.com/10/'.
        dirname:  city name; output file is './邮编<dirname>.csv' in UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.get(base_url, headers=headers, timeout=10)
    # The city pages declare gb2312; decode explicitly.
    response.encoding = 'gb2312'
    page_data = etree.HTML(response.text)
    rows = page_data.xpath("//table/tr[@bgcolor='#ffffff']")
    # Open once for the whole table (the original reopened the file per row)
    # and emit real CSV rows via csv.writer instead of str(list) with no
    # newline. newline='' lets csv.writer control line endings itself.
    with open('./邮编' + dirname + '.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for row in rows:
            record = []
            for cell in row.xpath("./td"):
                text = cell.xpath("string()")
                # Skip placeholder cells containing only a non-breaking space.
                if text == '\xa0':
                    continue
                record.append(text)
            if record:
                writer.writerow(record)
if __name__ == '__main__':
    # Entry point: crawl the index at https://www.ip138.com/post/
    # (e.g. https://www.ip138.com/10/ is the Beijing postal-code page).
    base_url = 'https://www.ip138.com/'
    city_page(base_url)