爬数字报

  1 """
  2 爬取当天数字报全部文章内容，写入txt文档中
  3 """
  4 
  5 import requests
  6 from lxml import etree
  7 import datetime,sys
  8 
  9 
 10 def today_index():  # 构造数字报链接
 11     today = str(datetime.date.today()).split('-')  # 获取今天日期,以‘-’分割
 12     web_today = today[0] + today[1]+'/'+today[2]  # 格式如  202106/22/
 13     base_url = 'http://www.XX.com.cn/pc/layout/'+web_today+'/node_01.html'
 14     print(1111111)
 15     return [web_today, base_url]
 16     
 17 
 18 
 19 def input_date():
 20     print(2222222222)
 21 
 22 def downloadHtml(url):  # 发起访问请求，获取页面,可重复使用
 23     headers = {
 24         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
 25         'Cookie': 'AD_RS_COOKIE=20080918; _trs_uv=kahvgie3_6_fc6v'
 26     }
 27     response = requests.get(url=url, headers=headers, timeout=20)
 28     # print(response)
 29     response.encoding = 'utf-8'
 30     if response.status_code != 200:
 31         print('没有获取到页面，是否还未上传？')
 32         sys.exit(0)
 33         

 34     else:
 35         html = etree.HTML(response.text)
 36         return html
 37 
 38 
 39 def get_index(html):   # 获取首页信息
 40     Chunkiconlist = html.xpath("//div[@class='Chunkiconlist']//p")
 41     layout_name = html.xpath("//div[@class='Chunkiconlist']//p/a[1]/text()")  # 每一版的名字
 42     num = len(Chunkiconlist)
 43     return [num, layout_name]  # 返回版数以及名称
 44 
 45 
 46 def get_article_urls(html):
 47     hrefs = html.xpath("//div[@class='newslist']//ul//li/h3/a/@href")
 48     titles = html.xpath("//div[@class='newslist']//ul//li/h3/a/text()")
 49     article_urls = []
 50     for i in hrefs:
 51         i = 'http://paper.zgxxb.com.cn/pc'+str(i)[8:]  # 文章链接
 52         # print(i)
 53         article_urls.append(i)
 54     return article_urls
 55 
 56 
 57 def get_article_content(html):  # 读一篇文章
 58 
 59     pre_list = html.xpath("//div[@class='newsdetatit']/p[1]/text()")  # 扩展字段
 60     sub_list = html.xpath("//div[@class='newsdetatit']/p[2]/text()")  # 副标题
 61     if len(pre_list):
 62         pretitle = '扩展字段：'+pre_list[0]
 63     else:
 64         pretitle = ''
 65 
 66     if len(sub_list):
 67         subtitle = '副标题，正文居中：'+sub_list[0]
 68     else:
 69         subtitle = ''
 70 
 71     title = html.xpath("//div[@class='newsdetatit']/h3/text()")[0]
 72     p = html.xpath("//div[@id='ozoom']//p")
 73     words = ''
 74 
 75     for i in p:
 76         j = (i.xpath('string(.)'))
 77         words = words + str(j)
 78         article_content = '\n'+pretitle+'\n'+title + \
 79             '\n'+subtitle+'\n'+words+'\n\n'+30*' - '
 80 
 81     #print(article_content)
 82     return article_content
 83 
 84 
 85 def clear():  # 清空文档
 86     paper_file = './paper.txt'
 87     with open(paper_file, 'w', encoding='utf-8') as f:
 88         f.truncate()
 89 
 90 
 91 def write_words(words):  # 写入内容
 92     paper_file = './paper.txt'
 93     with open(paper_file, 'a', encoding='utf-8') as f:  # 追加模式写
 94         f.write(words)
 95 
 96 
 97 def one_layout(article_urls):  # 读每一版
 98     for i in article_urls:
 99         article_htmls = downloadHtml(i)
100         words = get_article_content(article_htmls)
101         write_words(words)
102 
103 
def choice():
    """Ask whether to scrape today's edition or another date, then dispatch.

    The prompt repeats until the user enters ``1`` or ``2``. Non-numeric
    input is treated as an invalid answer instead of raising.

    Returns:
        Whatever the selected handler returns (``today_index`` for 1,
        ``input_date`` for 2).
    """
    handlers = {1: today_index, 2: input_date}
    while True:
        print("\n是否抓取今天的数字报内容？")
        raw = input('1:是\n2:另行输入日期\n\n')
        try:
            answer = int(raw)
        except ValueError:
            answer = None
        if answer in handlers:
            # Returning from inside the loop means the result is never lost
            # after a retry, which the earlier recursive retry did.
            return handlers[answer]()
        print('请重新输入:\n')
114         
115     
116
120 
# Resolve today's edition once, so the date segment and the front-page URL
# always come from the same call.
web_today, base_url = today_index()
index_html = downloadHtml(base_url)

# Layout count and layout names, taken from a single pass over the front page.
num, layout_names = get_index(index_html)

clear()
print('start\n')
# Walk the layouts from the first to the last one.
for i in range(1, num + 1):
    # The front page is node_01, so layout numbers are zero-padded to two
    # digits; assumed to continue as node_10, node_11, ... -- TODO confirm.
    url = ('http://www.xx.com.cn/pc/layout/' + web_today
           + '/node_' + str(i).zfill(2) + '.html')

    layout_html = downloadHtml(url)
    article_urls = get_article_urls(layout_html)
    # Write the layout name as a heading before that layout's articles.
    write_words('\n' + layout_names[i - 1] + '\n')
    print(layout_names[i - 1] + '\n')
    one_layout(article_urls)

print('finish')
146 
147

来了新单位，第一件事情，每天发布新闻动态，熟悉之后就是点开每篇文章复制，然后粘贴到word里，清除格式，再发布到后台。

有时候文章多了，复制很麻烦，于是就想写个爬虫试试。

第一个难点是构造链接，python的datetime模块很好用，再加上字符串的操作，完成。

最开始没有思路，照着网上的爬虫模仿，先分别写几个函数，例如用来访问并获取页面内容的downloadHtml(url)；

接着是首页，可以得到当天的版数，有时4版，有时8版。

接着是遍历每一版，进而获取每一篇文章的链接，配合浏览器和插件xpath定位，决定直接以字符串形式存储。

最后是一个while循环，循环次数由版数决定。将字符串写入TXT文件，这样格式会被自动统一，再导入word时只需要替换一下开头的空行，省去了不少麻烦。

花了几天时间，重新捡起了python，还不错。通过自己的想法，写出来爬虫，节省了工作时间。

后面本来还想再优化一下，做个选择日期的函数，不过换了新网站，不用再传了。有时间还是要再完善完善代码，现在变量名称有点乱。

posted @ 2021-07-16 11:34 CP喜欢晒太阳阅读(95) 评论(0) 收藏举报

CP喜欢晒太阳

爬数字报

公告