Scraping the China Industrial Park Network (cnrepark.com) with Python + Selenium

The site lists industrial parks in seven categories (gyy/park1/ through gyy/park7/), each paginated at 13 parks per page. The script below drives headless Chrome through every listing page of each category, follows each park's detail link, and writes the detail sections out as one HTML file per park.

import math
import re

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome headless so the crawl works without an open browser window
options = Options()
options.add_argument('--headless')
br = webdriver.Chrome(options=options)
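# Note: the original chrome_options= keyword is the pre-Selenium-4 spelling;
# options= (used above) is accepted by later Selenium 3 releases and Selenium 4.
# Under selenium>=4 the driver is usually wired up through a Service object
# instead, roughly:
#   from selenium.webdriver.chrome.service import Service
#   br = webdriver.Chrome(service=Service(), options=options)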


class ChanyeList(object):

    # Browser-style headers (defined for reference; Selenium sends its own)
    User_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    Refer = "https://www.cnrepark.com"

    def __init__(self, br):
        self.br = br

    # Fetch a page with Selenium and return the rendered HTML source
    def selenium_download(self, url):
        self.br.get(url)
        return self.br.page_source

    # Read the total result count from the "nw_num" element of a listing page
    def get_total_numbers(self, url):
        content = self.selenium_download(url)
        html = etree.HTML(content)
        totalNumbersTxt = html.xpath('.//div[@class="nw_num"]/text()')
        # the first run of digits in the text is the total number of parks
        totalNumbers = re.findall(r'\d+', totalNumbersTxt[0])
        return totalNumbers[0]
    # Parse a listing page into a list of {title, href, src} dicts
    def html_parse_list(self, content):
        html = etree.HTML(content)
        listObj = html.xpath('.//div[@class="area"]//div[@class="con_lst"]')
        items = []
        for item in listObj:
            src = item.xpath('./div/a/img/@src')
            href = item.xpath('./div//h2/a/@href')
            title = item.xpath('./div//h2/a/text()')
            items.append({'title': title[0], 'href': href[0], 'src': src[0]})
        return items

    # Strip the b'...' wrapper and escaped newlines left behind by
    # converting etree.tostring() bytes to str with str()
    def optimizeContent(self, res):
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        return res

    # Parse a detail page: the basic-info block plus the four tabbed sections
    def html_parse_detail(self, content):
        html = etree.HTML(content)

        def section(xpath):
            node = html.xpath(xpath)
            return self.optimizeContent(str(etree.tostring(node[0])))

        return {
            # basic information
            'detail': section('.//div[@class="right_nr"]/div[1]//div[@class="kfq_box"]/ul'),
            # regional advantages
            'regionalAdvantages': section('.//div[@id="tbc_81"]'),
            # basic facilities
            'basicConfiguration': section('.//div[@id="tbc_82"]'),
            # preferential policies
            'preferentialPolicy': section('.//div[@id="tbc_83"]'),
            # planning and construction
            'planningInformation': section('.//div[@id="tbc_84"]'),
        }
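    # Side note: etree.tostring(node[0], encoding="unicode") returns str
    # directly, which would avoid the b'...' artifacts and make
    # optimizeContent unnecessary; the original str() approach is kept above.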


    # Download one listing page and return its parsed items
    def crawl_url(self, url):
        print("crawl page {}".format(url))
        listContent = self.selenium_download(url)
        return self.html_parse_list(listContent)

    # Category names, index-aligned with park1/ ... park7/ in the URLs
    def get_name(self, index):
        nameList = [
            "特色园区",    # specialty parks
            "创意园",      # creative parks
            "孵化基地",    # incubation bases
            "商务园区",    # business parks
            "生态园区",    # eco parks
            "综合乐园",    # comprehensive parks
            "产业园转移区"  # industry-transfer zones
        ]
        return nameList[index]

    # Fetch each item's detail page and save it as an HTML file under ./txt/
    def save_list(self, items, type_index):
        try:
            for item in items:
                url = item['href']
                print("crawl url :" + url)
                content = self.selenium_download(url)
                detailList = self.html_parse_detail(content)
                item['title'] = self.validateTitle(item['title'])
                type_name = self.get_name(type_index)
                with open("./txt/" + type_name + "-" + item['title'] + ".html", "w", encoding="utf-8") as f:
                    f.write("<h2>{}</h2>".format(item['title']))
                    f.write("<div> <a href='{}'><img style='width:80px;height:80px;' src={} /></a></div>".format(item['href'], item['src']))
                    f.write("<p>{}</p>".format(detailList['detail']))
                    f.write("<h3>区位优势:</h3>{}".format(detailList['regionalAdvantages']))
                    f.write("<h3>基础配套:</h3>{}".format(detailList['basicConfiguration']))
                    f.write("<h3>优惠政策:</h3>{}".format(detailList['preferentialPolicy']))
                    f.write("<h3>规划建设:</h3>{}".format(detailList['planningInformation']))
                    f.write("<br>")
        except Exception as e:
            print("Exception:" + str(e))
    # Replace characters that are illegal in filenames with underscores
    def validateTitle(self, title):
        rstr = r"[\/\\\:\*\?\"\<\>\|\(\)]"  # / \ : * ? " < > | ( )
        new_title = re.sub(rstr, "_", title)
        return new_title
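# For illustration (hypothetical input): validateTitle("A/B:C*D") yields
# "A_B_C_D", so sanitized titles are safe to use directly as filenames.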
if __name__ == "__main__":
    try:
        chanyeList = ChanyeList(br)
        for k in range(1, 8):
            baseUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/?publishtime=desc&page={}"
            pageUrl = "https://www.cnrepark.com/gyy/park" + str(k) + "/"
            # derive the page count from the total (13 parks per listing page)
            totalNumbers = chanyeList.get_total_numbers(pageUrl)
            totalPage = math.ceil(int(totalNumbers) / 13)
            result = []
            for page in range(1, totalPage + 1):
                realUrl = baseUrl.format(page)
                items = chanyeList.crawl_url(realUrl)
                result.extend(items)
            chanyeList.save_list(result, k - 1)

        br.quit()
    except Exception as e:
        print(str(e))
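
The class defines User_Agent and Refer but never sends them: Selenium drives a real browser, which supplies its own headers. They only become useful if the Selenium fetch is swapped for a plain HTTP client. If the listing pages happen to render without JavaScript, a requests-based downloader along these lines could stand in for selenium_download (a minimal sketch; the method name and the monkey-patch are illustrative, not part of the original script):

import requests

def requests_download(self, url):
    # Reuse the class constants as real HTTP headers; this only helps
    # when the target page does not need JavaScript to render.
    headers = {
        "User-Agent": ChanyeList.User_Agent,
        "Referer": ChanyeList.Refer,
    }
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # guard against GBK-encoded pages
    return resp.text

# ChanyeList.selenium_download = requests_download  # drop-in replacement

Patching it in this way leaves the rest of the pipeline untouched, at the cost of losing any content the site injects client-side.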
