(1)cctv-获取内容
内容解析
目录结果

#weixin_wechat/xinwenlianbo/get_cctv_xinwenlianbo.py
# -*- coding: utf-8 -*-
"""
爬虫源:[新闻联播](https://tv.cctv.com/lm/xwlb/index.shtml?spm=C31267.PFsKSaKh6QQC.EYAqX3oMSyve.3)
"""
import requests
import re
import time
#import pprint
#from bs4 import BeautifulSoup
from datetime import datetime ,timedelta
import json
from lxml import etree
# 輸入結果列表
def get_html_loop(url, loop=2, timeout=10):
    """Download ``url`` with ``requests.get`` and return the body as text.

    Args:
        url: Address to download. ``None`` or an empty value is rejected.
        loop: Maximum number of attempts made when the request raises.
        timeout: Seconds handed to ``requests.get`` as ``timeout`` so that a
            stalled connection raises (and is retried) instead of blocking
            indefinitely.

    Returns:
        The response text, decoded as UTF-8, when the server answered with
        status 200. ``None`` when the url is invalid, the status is not 200,
        or every attempt raised a ``requests`` error.
    """
    if not url:
        print('one invalid url is found!')
        return None
    print('url:' + url)

    retry_delay = 1  # seconds to pause after a failed attempt
    for _attempt in range(loop):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Only network-level failures are retried; anything else
            # (KeyboardInterrupt, programming errors) now propagates.
            print("Connection refused by the server..")
            print("Let me sleep for %d seconds" % retry_delay)
            print("ZZzzzz...")
            time.sleep(retry_delay)
            print("Was a nice sleep, now let me continue...")
            continue

        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        # A non-200 answer is not retried.
        print('response from %s is invalid!' % url)
        break
    return None
# url = "https://tv.cctv.com/lm/xwlb/day/20210222.shtml"
# response_html = get_html_loop(url)
# print(response_html)
def get_content(response_html):
    """Extract title texts and link targets from a page's HTML.

    Args:
        response_html: HTML source as text. May be ``None`` or empty, which
            is what ``get_html_loop`` hands back when the download failed.

    Returns:
        A ``(result_title, items_href)`` tuple: every text node found below
        ``div`` elements whose class is ``title``, and the ``href`` attribute
        of every ``li/a`` link. Both are empty lists when there is no HTML.
    """
    if not response_html:
        # Nothing was downloaded, so there is nothing to query.
        return [], []
    html = etree.HTML(response_html)
    result_title = html.xpath('//div[@class="title"]//text()')
    items_href = html.xpath('//li/a/@href')
    return result_title, items_href
#title_list, items_href= get_content(response_html)
def handle_toc(title_list):
    """Render titles as a numbered HTML list without bullet markers.

    Args:
        title_list: Iterable of title strings; every ``[视频]`` marker is
            removed from each title.

    Returns:
        A ``<ul>`` string with one ``<li>`` per title, each prefixed with its
        1-based position (``1.``, ``2.``, ...).
    """
    cleaned = [title.replace('[视频]', '') for title in title_list]
    # enumerate() numbers by position; looking the value up with list.index()
    # would give repeated titles the number of their first occurrence.
    numbered = [
        str(position) + '.' + title
        for position, title in enumerate(cleaned, 1)
    ]
    list_items = ['<li>'.rjust(8) + entry + '</li>' for entry in numbered]
    return (
        '<ul style="list-style-type:none;">\n'
        + '\n'.join(list_items)
        + '\n</ul>'
    )
# def handle_href_toc(href_toc):
# response_html = get_html_loop(href_toc,loop=2)
# html = etree.HTML(response_html)
# content =html.xpath('//*[@id="chbox01"]/div[2]/div[1]/div/div[1]/p[3]//text()')
# """
# 标题时间直接使用基础url的title拼接
# """
# content = handle_string(str(content))
# content = content.replace('视频简介:','').replace('本期节目主要内容:','今天央视新闻联播直播主要内容新鲜事:')
# #content = content.replace('视频简介:本期节目主要内容:\\r\\n','今天央视新闻联播直播主要内容新鲜事:').replace("'',",'')
# print(content)
# items = content.split('\\r\\n')
# print('列表长度',len(items))
# if len(items) >3 :
# items = items
# elif len(items) <3:
# items = content.splitlines()
# else :
# items = re.split(r"[~\r\n]+", content)
# items_to_html = '<ul style="list-style-type:none;">\n' + '\n'.join(['<li>'.rjust(8) + name + '</li>' for name in items]) + '\n</ul>'
# return items_to_html
def handle_href_toc(href_toc):
    """Download a page and render its summary paragraph as an HTML list.

    The title and date are not extracted here; per the original author's
    note they are taken from the title of the base url instead.

    Args:
        href_toc: Address of the page to download.

    Returns:
        A ``<ul>`` string with one ``<li>`` per ``;``-separated piece of the
        cleaned summary text.
    """
    response_html = get_html_loop(href_toc, loop=2)
    html = etree.HTML(response_html)
    content = html.xpath('//*[@id="chbox01"]/div[2]/div[1]/div/div[1]/p[3]//text()')
    # The xpath result is a list; its repr is cleaned up as one string, which
    # is why line breaks show up below as the literal characters \r\n.
    content = handle_string(str(content))
    content = content.replace('视频简介:,', '').replace('本期节目主要内容:', '今天央视新闻联播直播主要内容新鲜事:')
    print(content)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    content = re.sub(r'\s+', '', content)
    content = re.sub(r'\\r\\n', '', content)
    # The former substitutions for "''," and "','" were dropped: handle_string
    # has already removed every single quote, so they could never match.
    content = content.replace(' ', '')
    items = content.split(';')
    print('列表长度', len(items))
    items_to_html = (
        '<ul style="list-style-type:none;">\n'
        + '\n'.join(['<li>'.rjust(8) + name + '</li>' for name in items])
        + '\n</ul>'
    )
    return items_to_html
#title_toc_html = handle_toc(title_list)
def handle_href(href):
    """Download one page and return its title, body text, source and image.

    Args:
        href: Address of the page to download.

    Returns:
        ``(href_title, href_content, href_source, href_pic)``, each a string
        built from the repr of the matching xpath result after cleaning with
        ``handle_string``.
    """
    page = etree.HTML(get_html_loop(href, loop=2))

    title_nodes = page.xpath('//div[@class="cnt_nav"]/h3//text()')
    body_nodes = page.xpath('//div[@class="mbd"]/div[@class="cnt_bd"]//text()')
    source_nodes = page.xpath('//div[@class="mtab_con"]/div[@class="chblock"]//text()')
    image_nodes = page.xpath('//meta[@property="og:image"]/@content')

    href_title = handle_string(str(title_nodes))
    href_content = handle_string(str(body_nodes))

    # The list repr carries escaped line breaks and tabs as literal
    # backslash sequences; strip those and the separating commas.
    href_source = handle_string(str(source_nodes))
    for unwanted in ('\\r\\n', '\\t', ','):
        href_source = href_source.replace(unwanted, '')

    # Every '//' occurrence is removed from the image address.
    href_pic = handle_string(str(image_nodes)).replace('//', '')

    return href_title, href_content, href_source, href_pic
def handle_href_html(index, href):
    """Download one page and return its parts with the body kept as HTML.

    Args:
        index: Number placed in front of the title by ``handle_h1_style``.
        href: Address of the page to download.

    Returns:
        ``(href_title, href_content, href_source, href_pic)``:
        the title wrapped by ``handle_h1_style``; the serialized first
        ``div.cnt_bd`` element below ``div.mbd`` (``''`` when the page has
        none); the cleaned source text; and the ``og:image`` value with every
        ``'//'`` removed.
    """
    response_html = get_html_loop(href, loop=2)
    html = etree.HTML(response_html)

    # Title
    href_title = html.xpath('//div[@class="cnt_nav"]/h3//text()')
    href_title = handle_string(str(href_title))
    href_title = handle_h1_style(index, href_title)

    # Body: keep the markup. Guard the lookup so a page without the expected
    # container yields empty content instead of an IndexError.
    content_nodes = html.xpath('//div[@class="mbd"]/div[@class="cnt_bd"]')
    if content_nodes:
        href_content = etree.tostring(content_nodes[0], pretty_print=True).decode()
    else:
        href_content = ''

    # Source line: the list repr carries escaped line breaks and tabs as
    # literal backslash sequences, which are stripped along with commas.
    href_source = html.xpath('//div[@class="mtab_con"]/div[@class="chblock"]//text()')
    href_source = handle_string(str(href_source)).replace('\\r\\n', '').replace('\\t', '').replace(',', '')

    # Preview image
    href_pic = html.xpath('//meta[@property="og:image"]/@content')
    href_pic = handle_string(str(href_pic)).replace('//', '')

    return href_title, href_content, href_source, href_pic
def handle_string(text):
    """Strip the ``[视频]`` marker, square brackets and single quotes.

    Args:
        text: String to clean (callers in this file pass the repr of a list).

    Returns:
        ``text`` with every ``[视频]``, ``[``, ``]`` and ``'`` removed.
    """
    # Order matters: the full marker goes first, then any leftover brackets,
    # and finally the quotes.
    for unwanted in ('[视频]', '[', ']', "'"):
        text = text.replace(unwanted, '')
    return text
def handle_h1_style(index, text):
    """Wrap ``index.text`` in bold ``<h3>`` markup inside a paragraph.

    Args:
        index: Value converted with ``str`` and placed before the dot.
        text: Title string placed after the dot.

    Returns:
        ``<p><h3><strong>{index}.{text}</strong></h3></p>``.
    """
    pieces = ("<p><h3><strong>", str(index), '.', text, "</strong></h3></p>")
    return ''.join(pieces)
def get_url(string_date, base_url='https://tv.cctv.com/lm/xwlb/day/'):
    """Build the address of the page for one day.

    Args:
        string_date: The day as a ``YYYYMMDD`` string, or any object with a
            ``strftime`` method (such as ``date`` or ``datetime``), which is
            formatted as ``%Y%m%d``.
        base_url: Prefix placed before the date; defaults to the address
            this script has always used.

    Returns:
        ``base_url + <date> + '.shtml'``.
    """
    if hasattr(string_date, 'strftime'):
        string_date = string_date.strftime('%Y%m%d')
    return base_url + string_date + '.shtml'
def get_cctv_content():
    """Download yesterday's page and return its table of contents as HTML.

    The page address is built from yesterday's local date. As a side effect
    the title of the second link on the page is fetched and printed.

    Returns:
        The ``<ul>`` string produced by ``handle_toc`` for the page's titles.
    """
    yesterday = datetime.today() - timedelta(days=1)
    url = get_url(yesterday.strftime('%Y%m%d'))
    response_html = get_html_loop(url)
    title_list, items_href = get_content(response_html)
    title_toc_html = handle_toc(title_list)

    # Preview the second link. handle_href_html needs the heading number as
    # its first argument; the previous one-argument call raised TypeError.
    # NOTE(review): using the list position as the heading number is an
    # assumption - confirm the intended numbering.
    detail_position = 1
    if len(items_href) > detail_position:
        href_title, _content, _source, _pic = handle_href_html(
            detail_position, items_href[detail_position]
        )
        print(href_title)
    return title_toc_html
if __name__ == "__main__":
    # Run the crawler once, print the result, then report the elapsed time.
    started_at = time.time()
    toc_markup = get_cctv_content()
    print(toc_markup)
    finished_at = time.time()
    elapsed_seconds = finished_at - started_at
    print("程序耗时%f秒." % elapsed_seconds)

浙公网安备 33010602011771号