爬取天眼查工商信息

import requests
import re
import json
import time
from xlrd import open_workbook
from xlutils.copy import copy


class TianYanChaSpider():
    def __init__(self,keyword):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
        }
        self.keyword = keyword
        self.xlsx_file = r'business_datas.xlsx'

    def timeStamp(self,timeNum):
        if timeNum and timeNum > 0:
            try:
                timeStamp = float(timeNum / 1000)
                timeArray = time.localtime(timeStamp)
                otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
                return otherStyleTime
            except:
                return ''
        else:
            return ''

    def start_url(self):
        url = 'https://www.tianyancha.com/search?key=' + self.keyword
        response = requests.get(url=url, headers=self.headers)
        detail_id = re.findall(r'https://www.tianyancha.com/company/(.*?)"', response.text)[0]  #默认取第一个
        detail_url = 'https://www.tianyancha.com/company/' + detail_id
        return detail_url

    def detail_context(self):
        url = self.start_url()
        detail_response = requests.get(url=url,headers=self.headers)
        json_data = re.findall(r'type="application/json">(.*?)</script></body></html>',detail_response.text)[0]
        data= json.loads(json_data)

        queries = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']
        name = queries.get('name','')                      # 公司名称
        creditCode = queries.get('creditCode','')          # 统一社会信用代码
        companyOrgType = queries.get('companyOrgType','')  # 企业类型
        taxAddress = queries.get('taxAddress','')          # 注册地址
        legalPersonName = queries.get('legalPersonName','')# 法定代表人
        regCapital = queries.get('regCapital','')          # 注册资本
        fromTime = self.timeStamp(queries.get('fromTime',''))   # 营业期限开始时间
        toTime = self.timeStamp(queries.get('toTime',''))       # 营业期限截止时间

        if toTime and fromTime:                            # 营业期限截止时间
            timeRange = fromTime + '' + toTime
        elif not fromTime and not toTime:
            timeRange = ''
        else:
            if fromTime:
                timeRange = fromTime + '' + '无固定期限'
            else:
                timeRange = ''
        estiblishTime = self.timeStamp(queries.get('estiblishTime',''))  # 成立时间
        businessScope = queries.get('businessScope','')     # 经营范围
        self.save_data(name,creditCode,companyOrgType,taxAddress,legalPersonName,regCapital,timeRange,estiblishTime,businessScope)


    def save_data(self,name,creditCode,companyOrgType,taxAddress,legalPersonName,regCapital,timeRange,estiblishTime,businessScope):
        # 获取源表内容
        r_xls = open_workbook(self.xlsx_file)  # 读取excel文件
        row = r_xls.sheets()[0].nrows      # 获取已有的行数
        excel = copy(r_xls)                # 将xlrd的对象转化为xlwt的对象
        table = excel.get_sheet(0)         # 获取要操作的sheet,默认是第一个表
                                           # 对excel表追加一行内容
        table.write(row, 0, name)          # 括号内分别为行数、列数、内容
        table.write(row, 1, creditCode)
        table.write(row, 2, companyOrgType)
        table.write(row, 3, taxAddress)
        table.write(row, 4, legalPersonName)
        table.write(row, 5, regCapital)
        table.write(row, 6, timeRange)
        table.write(row, 7, estiblishTime)
        table.write(row, 8, businessScope)
        # 保存并覆盖文件
        excel.save(self.xlsx_file)
        print('{}---保存成功'.format(name))


if __name__ == '__main__':
    with open('gs_lists_good.txt','r',encoding='gbk')as f:
        data = f.readlines()
    for i in data:
        keyword = i.strip()
        TianYanChaSpider(keyword).detail_context()

 

 注:本项目仅用于学习研究使用,请勿将本项目的任何内容用于商业或非法目的,否则后果自负。

posted @ 2022-09-16 08:58  lvye001  阅读(353)  评论(0编辑  收藏  举报