import os
import re
import zipfile
import logging
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
# Send log records to a file rather than to the console.
logging.basicConfig(
    filename='new.log',
    # 'a' appends to the existing log; 'w' would rewrite the file on every
    # run. Append is also what is used when filemode is omitted.
    filemode='a',
    # Records below INFO are not written.
    level=logging.INFO,
    # Layout: timestamp - source path[line:N] - level: message
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)
def Readzip(file_name):
    """Read the non-empty 'guokanzhiguang' members of a zip archive.

    Every member name in the archive is printed. Members whose name
    contains ``'guokanzhiguang'`` are read; those with empty content
    (directory entries, empty files) are skipped.

    Args:
        file_name: Path of the zip archive to open.

    Returns:
        A list of ``bytes`` objects, one per matching non-empty member, in
        archive order. If the archive cannot be opened or read, the string
        ``'Readzip Running Faild!!'`` is returned instead of raising.
    """
    guokanzhiguang_folder = 'guokanzhiguang'
    try:
        guokanzhiguang_list = []
        # The context manager closes the archive even when a read fails.
        with zipfile.ZipFile(file_name, 'r') as z:
            for filename in z.namelist():
                # Show each member name found in the archive.
                print(filename)
                if guokanzhiguang_folder not in filename:
                    continue
                content = z.read(filename)
                if content:
                    guokanzhiguang_list.append(content)
        return guokanzhiguang_list
    except Exception:
        # Callers expect the sentinel string rather than an exception, so
        # keep returning it, but record the cause in the log.
        logging.exception('Readzip failed for %r', file_name)
        return 'Readzip Running Faild!!'
def getBookList(letter_lst):
    """Parse each HTML document and collect its book-result elements.

    Args:
        letter_lst: Iterable of HTML documents, one per source file.

    Returns:
        A list with one entry per document; each entry is the ``find_all``
        result for ``<div class="book-result-item-warp">`` in that document.
        The string ``'getBookList Running Faild!!'`` is returned when the
        input is a bare string/bytes value or when parsing raises.
    """
    failure = 'getBookList Running Faild!!'
    # A bare str/bytes is not a collection of documents; the functions in
    # this file report failure with a message string, so treat it as one.
    if isinstance(letter_lst, (str, bytes)):
        logging.error('getBookList received %r instead of a list of documents',
                      letter_lst)
        return failure
    try:
        # The accumulator must exist before the loop appends to it.
        tag = []
        for html in letter_lst:
            soup = BeautifulSoup(html, 'html.parser')
            tag1 = soup.find_all('div', attrs={'class': "book-result-item-warp"})
            tag.append(tag1)
        return tag
    except Exception:
        # Keep the sentinel return value, but record why parsing failed.
        logging.exception('getBookList failed')
        return failure
def _parse_book_entry(bookinfo):
    """Extract the seven spreadsheet fields from one book-result element."""
    center = bookinfo.find('div', attrs={'class': 'center'})

    # Journal title, with any (...), {...} or [...] annotation removed.
    raw_title = center.find("div", attrs={"class": "title"}).get_text()
    title = re.sub(r"\(.*?\)|\{.*?}|\[.*?]", "", raw_title)

    allinfo = center.find_all('div', attrs={'class': "info"})

    # The fixed-length slices below drop the leading characters of each
    # text (presumably a label prefix; confirm against the page markup).
    country = allinfo[0].get_text()[4:]

    # Impact factor: the 'ifs' value followed by its 'diff' value.
    ifs = allinfo[1].find('span', class_='ifs').get_text()
    diff = allinfo[1].find('span', class_='diff').get_text()

    period = allinfo[2].get_text()[4:]
    ratio = allinfo[3].get_text()[6:]
    # Address: the target of the link inside the fifth info block.
    addre = allinfo[4].find('a').get('href')
    cited_rate = allinfo[5].get_text()[5:]

    return [title, country, ifs + " " + diff, period, ratio, addre, cited_rate]


def getBookElementInfo(letter_lst):
    """Build one row of fields for every book found in the HTML documents.

    Args:
        letter_lst: HTML documents, passed unchanged to ``getBookList``.

    Returns:
        A list of rows, each a list of seven values in the order: journal
        title, country, impact factor, period, ratio, address, self-citation
        rate. An empty string is returned (after printing a notice) when
        ``getBookList`` yields nothing. The string
        ``'getBookElementInfo Running Faild!!'`` is returned when
        ``getBookList`` reports failure or when any entry cannot be parsed.
    """
    failure = 'getBookElementInfo Running Faild!!'
    try:
        alllist = getBookList(letter_lst)
        # getBookList reports failure with a message string, not a list.
        if isinstance(alllist, str):
            logging.error('getBookElementInfo: upstream failure: %s', alllist)
            return failure
        if not alllist:
            print('txt文件不存在或内容为空!!!')
            return ''
        print('文件个数:%d' % len(alllist))
        return [_parse_book_entry(bookinfo)
                for lst in alllist
                for bookinfo in lst]
    except Exception:
        # A malformed entry aborts the whole run, as before; log the cause
        # so the failure can be diagnosed.
        logging.exception('getBookElementInfo failed')
        return failure
def Insert2Excel(bookinfo):
    """Write the book rows to the workbook 'gkbookinfolist.xlsx'.

    A sheet named 'gk_sheet' is created with a header row, every header
    column is set to width 15, each item of ``bookinfo`` is appended as a
    row, and the workbook is saved under a relative file name.

    Args:
        bookinfo: Iterable of rows to append below the header. An empty
            value produces a workbook that contains only the header row.

    Returns:
        ``'Insert Excel succcessfully!'`` once the workbook is saved, or
        ``'Insert Excel failed!'`` if building or saving it raises.
    """
    tableTitle = ['杂志', '国家', '因子', '周期', '占比', '地址', '自引']
    work_name = 'gkbookinfolist.xlsx'
    try:
        wb = Workbook()
        ws = wb.active
        ws.title = 'gk_sheet'
        ws.append(tableTitle)
        # Only the header row exists at this point, so max_column covers
        # exactly the header columns.
        for i in range(1, ws.max_column + 1):
            ws.column_dimensions[get_column_letter(i)].width = 15
        for info in bookinfo:
            ws.append(info)
        wb.save(work_name)
        return 'Insert Excel succcessfully!'
    except Exception:
        # Keep the status-string contract, but record why the export failed.
        logging.exception('Insert2Excel failed while writing %s', work_name)
        return 'Insert Excel failed!'
if __name__ == '__main__':
    # Current working directory; not used by the statements below.
    path = os.getcwd()

    # Pipeline: archive members -> per-book rows -> Excel workbook.
    letter_lst = Readzip('bookinfo.zip')
    bookinfo = getBookElementInfo(letter_lst)

    # Write the Excel file and show the resulting status message.
    status = Insert2Excel(bookinfo)
    print(status)