# 猫途鹰简单爬虫正则巩固 (TripAdvisor simple crawler: regular-expression practice)

import json
import re
from itertools import chain

import requests
from requests import RequestException

def get_page_index(url, timeout=10):
    """Download ``url`` and return the response body as text.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up. Without
        a timeout ``requests.get`` can block indefinitely on a stalled
        connection.
    :return: the response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Timeouts raised by requests are RequestException subclasses, so
        # they are reported here the same way as other request failures.
        print('请求页不存在')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract thumbnail URLs and destination details from page markup.

    :param html: page source as a string. ``None`` (what ``get_page_index``
        returns on failure) yields an empty list instead of a ``TypeError``.
    :return: a flat list that alternates between a thumbnail ``src`` string
        and a 7-tuple of the texts captured after the ``countryName``,
        ``cityName``, ``stb blockIcon``, ``cityGuide blockIcon``,
        ``hotelsCount``, ``attractionCount`` and ``eateryCount`` markers.
        Pairing stops at the shorter of the two match lists.
    """
    if html is None:
        return []
    # re.S lets ``.`` span newlines, since the markup is spread over lines.
    pattern_img = re.compile(r'<span.*?"thumbCrop".*?src="(.*?)"', re.S)
    pattern_url = re.compile(r'<a.*?countryName.*?>(.*?)</a>.*?cityName.*?>(.*?)'
                             r'</a>.*?stb blockIcon">(.*?)</span.*?cityGuide blockIcon">(.*?)'
                             r'</span>.*?hotelsCount.*?html">(.*?)</a>.*?attractionCount.*?html">(.*?)'
                             r'</a>.*?eateryCount.*?html">(.*?)</a>', re.S)
    imgs = pattern_img.findall(html)
    details = pattern_url.findall(html)
    # Interleave: [img0, details0, img1, details1, ...]
    return list(chain.from_iterable(zip(imgs, details)))
def write_to_file(content):
    """Append ``content`` to ``猫途鹰.txt`` as one JSON-encoded line.

    :param content: JSON-serialisable object to record.
    :return: None
    """
    # Mode 'a' creates the file when it is missing and appends otherwise.
    with open('猫途鹰.txt', 'a', encoding='utf-8') as out_file:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        serialized = json.dumps(content, ensure_ascii=False)
        out_file.write(serialized + '\n')
def main():
    """Fetch the start page, then print and store every parsed item."""
    url = 'https://www.tripadvisor.cn/'
    html = get_page_index(url)
    if html is None:
        # get_page_index returns None on a failed request or non-200 status;
        # handing that to the regex parser would raise a TypeError.
        print('Failed to fetch', url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
posted @ 2019-01-11 20:51 青春叛逆者阅读(344) 评论(0) 收藏举报
刷新页面返回顶部
青春叛逆者

猫途鹰简单爬虫正则巩固

公告