Structuring and Saving the Data

1. Save the body text of each news article to a text file.

def writeToDocument(filename, content):
    # Append the article body to a text file; utf-8 handles the Chinese text.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content)
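It is called from getNewsDetail below (see the commented-out line there). A typical call, once a news dictionary is available:

writeToDocument('gzccNews.txt', news['contentdetail'])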

 

2. Structure the news data into a list of dictionaries:

(1) The details of a single news item --> the dictionary news

import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(newsUrl):
    resdet = requests.get(newsUrl)
    resdet.encoding = 'utf-8'
    soupdet = BeautifulSoup(resdet.text, 'html.parser')
    news = {}

    news['title'] = soupdet.select('.show-title')[0].text

    if soupdet.select('.show-info'):
        showinfo = soupdet.select('.show-info')[0].text
        # Pull the 19-character timestamp after the "发布时间:" label; str.lstrip
        # strips a character set, not a prefix, so a regex is more reliable.
        date = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', showinfo).group(0)
        news['dateTime'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

        if showinfo.find('作者') > 0:
            news['author'] = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['author'] = 'none'

        if showinfo.find('审核') > 0:
            news['checker'] = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['checker'] = 'none'

        if showinfo.find('来源') > 0:
            # Non-greedy match up to the next label; in the original pattern
            # '来源:(.*)\s*摄|点' the '|' applied to the whole expression.
            news['source'] = re.search(r'来源:(.*?)\s*(?:摄影|点击)', showinfo).group(1)
        else:
            news['source'] = 'none'

        if showinfo.find('摄影') > 0:
            news['photographer'] = re.search(r'摄影:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['photographer'] = 'none'

        news['clicktimes'] = getClickCount(newsUrl)
    else:
        return

    # Test and select the same element: the original code tested '.show-content'
    # but then selected '#content', which could raise IndexError.
    if soupdet.select('#content'):
        news['contentdetail'] = soupdet.select('#content')[0].text
    else:
        return

    news['newsUrl'] = newsUrl

    # writeToDocument('gzccNews.txt', news['contentdetail'])

    return news
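getNewsDetail calls getClickCount, which was written in an earlier step of this exercise and is not repeated above. For reference, a minimal sketch, assuming the click count comes from the site's oa.gzcc.cn counter API (the endpoint URL and the response parsing below are assumptions, not taken from this section):

def getClickCount(newsUrl):
    # Assumption: the news id is the number before ".html" in the article URL.
    newsId = re.search(r'\_(.*).html', newsUrl).group(1).split('/')[-1]
    # Assumption: this counter endpoint returns a small JavaScript snippet.
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resc = requests.get(clickUrl)
    # Strip the JavaScript wrapper and keep only the bare number.
    return int(resc.text.split('.html')[-1].lstrip("(')").rstrip("');"))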

 

(2) All single news items on one list page, collected into a list: newsls.append(news)

def getListDetail(ListPageUrl):
    resl = requests.get(ListPageUrl)
    resl.encoding = 'utf-8'
    soupl = BeautifulSoup(resl.text, 'html.parser')
    gzccNewslist = {}
    newsls = []
    for news in soupl.select('li'):
        if len(news.select('.news-list-title')) > 0:
            gzccNewslist['title'] = news.select('.news-list-title')[0].text
            gzccNewslist['description'] = news.select('.news-list-description')[0].text
            gzccNewslist['info'] = news.select('.news-list-info')[0].text
            gzccNewslist['address'] = news.select('a')[0]['href']

            # getNewsDetail returns None for pages it cannot parse; skip those
            # so the final list holds only complete dictionaries.
            detail = getNewsDetail(gzccNewslist['address'])
            if detail:
                newsls.append(detail)
    return newsls
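A quick sanity check on a single list page (the URL is the index page crawled in the next step):

print(getListDetail('http://news.gzcc.cn/html/xiaoyuanxinwen/')[0])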

 

(3) Aggregate the news from all list pages into one list: newstotal.extend(newsls)

import locale

# Windows locale name; keeps Chinese text handling consistent on Windows.
locale.setlocale(locale.LC_CTYPE, 'chinese')

newstotal = []
Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
pagecount = getPageNum(Listurl)
for i in range(1, pagecount + 1):
    if i == 1:
        ListPageUrl = Listurl  # the index page has no page number in its URL
    else:
        ListPageUrl = Listurl + '{}.html'.format(i)
    newstotal.extend(getListDetail(ListPageUrl))
    break  # crawl only the first list page while testing; remove to crawl all
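getPageNum is also from an earlier step. A minimal sketch, assuming the pager element '#pages .a1' holds the total article count as text like '263条' and that each list page shows 10 articles (both are assumptions about this site's markup):

def getPageNum(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Assumption: the pager's first cell reads like "263条" (total articles).
    n = int(soup.select('#pages .a1')[0].text.rstrip('条'))
    # 10 articles per list page; the extra page holds the remainder.
    return n // 10 + 1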

 

3. Install pandas and use pandas.DataFrame(newstotal) to create a DataFrame object df.

import pandas

df = pandas.DataFrame(newstotal)
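Printing the shape and the first rows is a quick way to confirm that every dictionary became one row:

print(df.shape)
print(df.head())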

 

4. Use df to save the extracted data to a CSV or Excel file.

df.to_excel('gzccnews.xlsx')
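to_excel needs an Excel writer package such as openpyxl installed. A CSV file avoids that dependency:

df.to_csv('gzccnews.csv', encoding='utf-8')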

 

5. Use the functions and methods provided by pandas for data analysis:

(1) Extract the first 6 rows of the click-count, title, and source columns.

print(df[['clicktimes', 'title', 'source']].head(6))

 

(2) Extract the news published by '学校综合办' whose click count exceeds 3000.

print(df[(df['clicktimes'] > 3000) & (df['source'] == '学校综合办')])

 

(3) Extract the news published by '国际学院' and '学生工作处'.

soulist = ['国际学院', '学生工作处']
print(df[df['source'].isin(soulist)])

 

posted @ 2018-04-11 19:38  165邝启彬