Scraping postal codes nationwide

import requests
from lxml import etree

def city_page(base_url):
    url = base_url + 'post/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.get(url=url, headers=headers)
    page_data = etree.HTML(response.text)
    data = page_data.xpath("//table[@id='quanguo']//tr")

    for infos in data:
        info = infos.xpath(".//td")
        for i in info:
            href_num = i.xpath("./a/@href")
            href_name = i.xpath("./a/text()")
            # The last cell in a row can be empty, so skip cells without a link.
            if not href_num or not href_name:
                continue
            href = href_num[0]
            # The link text is mojibake (requests decoded it as ISO-8859-1);
            # re-encode it and decode as GBK to recover the Chinese name.
            dirname = href_name[0].encode('ISO-8859-1').decode('gbk')
            href_url = base_url + href
            post_code(href_url, dirname)

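About the encode('ISO-8859-1').decode('gbk') step in city_page: the index page is GBK-encoded, but when a server does not declare a charset, requests falls back to ISO-8859-1, which appears to be why the link text arrives garbled here. Re-encoding the garbled string returns the original bytes, which can then be decoded as GBK. A minimal standalone sketch of that round trip (the sample string is only illustrative):

# Simulate a GBK page being mis-decoded as ISO-8859-1, then repaired.
raw = '北京'.encode('gbk')                      # bytes the server actually sends
garbled = raw.decode('ISO-8859-1')              # what response.text would contain
fixed = garbled.encode('ISO-8859-1').decode('gbk')
print(fixed)                                    # -> 北京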

def post_code(base_url, dirname):
    url = base_url
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.get(url=url, headers=headers)
    # The detail pages are GB2312/GBK encoded; set the decoding explicitly.
    response.encoding = 'gb2312'
    page_data = etree.HTML(response.text)
    data = page_data.xpath("//table/tr[@bgcolor='#ffffff']")
    for infos in data:
        info = infos.xpath("./td")
        database = []
        for second_info in info:
            en_info = second_info.xpath("string()")
            # Skip placeholder cells that only hold a non-breaking space.
            if en_info == '\xa0':
                continue
            database.append(en_info)
        # Append one line per table row to that city's CSV file.
        with open('./邮编' + dirname + '.csv', 'a+', encoding='utf-8') as f:
            f.write(','.join(database) + '\n')


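The f.write above emits a plain comma-joined line; if a cell ever contained a comma itself, the fields would run together. The standard-library csv module handles the quoting for that case. A minimal sketch of the same append done with csv.writer (the sample dirname and database values are only illustrative):

import csv

dirname = '北京'                       # in the scraper this comes from city_page
database = ['东城区', '100010', '010']  # one row's cells, as built in post_code

with open('./邮编' + dirname + '.csv', 'a+', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(database)  # quotes any field containing commas or quotes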

if __name__ == '__main__':
    '''
    https://www.ip138.com/post/
    https://www.ip138.com/10/   Beijing postal code URL

    '''
    base_url = 'https://www.ip138.com/'
    city_page(base_url)

 
