网站更新内容:请访问: https://bigdata.ministep.cn/

获取 CCTV《新闻联播》标题的 TOC 目录

# -*- coding: utf-8 -*-
import requests 
import re
import time
#import pprint
#from bs4 import BeautifulSoup
from datetime import datetime ,timedelta
import json
from lxml import etree

# Download helper: fetch the HTML of a URL, retrying on request failures.
def get_html_loop(url,loop=2):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The request is attempted at most ``loop`` times. Only errors raised by
    ``requests`` (connection failure, timeout, ...) trigger a retry; any HTTP
    response, including a non-200 one, ends the loop immediately.

    Args:
        url: Address to fetch. ``None`` or an empty value is rejected.
        loop: Maximum number of attempts.

    Returns:
        The response text when the server answered with status 200,
        otherwise ``None``.
    """
    retry_delay = 1  # seconds to wait between two attempts
    request_timeout = 10  # seconds; keeps a stalled server from hanging us

    if not url:
        print('one invalid url is found!')
        return None
    print('url:'+url)

    for attempt in range(loop):
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as err:
            # Only network-level failures are retried; anything else (including
            # KeyboardInterrupt) propagates instead of being silently swallowed.
            print("Connection refused by the server.. (%s)" % err)
            if attempt + 1 < loop:
                print("Let me sleep for %d second(s)" % retry_delay)
                print("ZZzzzz...")
                time.sleep(retry_delay)
                print("Was a nice sleep, now let me continue...")
            continue

        # The pages are served as UTF-8; force it so the text is decoded
        # consistently regardless of what the response headers declare.
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        print('response from %s is invalid!' % url)
        return None

    return None


def get_content(response_html):
    """Parse a listing page and extract its title text and item links.

    Args:
        response_html: HTML source of the page, as text.

    Returns:
        A ``(result_title, items_href)`` tuple: the text nodes found under
        every ``<div class="title">`` and the ``href`` values of every
        ``<a>`` that is a direct child of an ``<li>``.
    """
    tree = etree.HTML(response_html)
    return (
        tree.xpath('//div[@class="title"]//text()'),
        tree.xpath('//li/a/@href'),
    )

def handle_href_toc(href_toc):
    """Fetch one episode page and render its summary as an HTML list.

    The summary paragraph of the page is flattened to a single string,
    cleaned of markers, quotes and whitespace, split on ``;`` and each piece
    is wrapped in an ``<li>`` inside an unstyled ``<ul>``.

    Args:
        href_toc: URL of the episode page.

    Returns:
        The ``<ul>...</ul>`` markup as a string.

    Raises:
        ValueError: If the page could not be downloaded.
    """
    response_html = get_html_loop(href_toc,loop=2)
    if not response_html:
        # get_html_loop returns None on failure; fail here with a clear
        # message rather than with an opaque error inside the HTML parser.
        raise ValueError('failed to download page: %s' % href_toc)

    html =  etree.HTML(response_html)
    content =html.xpath('//*[@id="chbox01"]/div[2]/div[1]/div/div[1]/p[3]//text()')
    # Original author's note: the title and date are taken from the title of
    # the base (listing) page, so only the summary paragraph is read here.

    # The list of text nodes is deliberately flattened through its repr, so
    # line breaks show up as the literal characters backslash-r backslash-n;
    # handle_string then drops the brackets, quotes and "[视频]" markers.
    content = handle_string(str(content))
    content = content.replace('视频简介:,','').replace('本期节目主要内容:','今天央视新闻联播直播主要内容新鲜事:')
    print(content)

    # Raw strings: '\s' in a normal string literal is an invalid escape.
    content = re.sub(r'\s+', '', content)
    content = re.sub(r'\\r\\n', '', content)
    content = content.replace(' ','')

    items = content.split(';')
    print('列表长度',len(items))
    # '<li>'.rjust(8) indents every list item by four spaces.
    items_to_html = '<ul style="list-style-type:none;">\n' + '\n'.join(['<li>'.rjust(8) + name + '</li>' for name in items]) + '\n</ul>'
    return items_to_html

def get_url(string_date):
    """Return the URL of the daily listing page for ``string_date``.

    Args:
        string_date: Date string placed between the base URL and the
            ``.shtml`` suffix (the caller passes it as ``YYYYMMDD``).

    Returns:
        The assembled URL string.
    """
    return ''.join(('https://tv.cctv.com/lm/xwlb/day/', string_date, '.shtml'))


def handle_string(text):
    """Strip video markers, square brackets and single quotes from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every "[视频]" marker, "[" , "]" and "'" removed.
    """
    # Order matters: the full "[视频]" marker must go before the bare brackets.
    for unwanted in ('[视频]', '[', ']', "'"):
        text = text.replace(unwanted, '')
    return text

# Work out yesterday's date; the listing page is addressed as YYYYMMDD.
today = datetime.today()
today_weekday_name = today.strftime('%A')
yesterday = datetime.today() - timedelta(days=1)
string_date = string_yesterday = yesterday.strftime('%Y%m%d')

# Download the listing page for that date and pull out titles and links.
url = get_url(string_date)
response_html = get_html_loop(url)
title_list, items_href = get_content(response_html)

# Render the summary of the first linked page as an HTML list.
title_toc_html = handle_href_toc(href_toc=items_href[0])

posted @ 2021-02-24 12:15  ministep88  阅读(108)  评论(0)    收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/