爬取5家公司(如:阿里巴巴、京东、亚马逊、华为、贵州茅台)在百度“资讯”中的新闻,每家公司各爬取10页内容。

将数据以MySQL存储,字段名包括:公司名、新闻标题、网址、新闻来源和时间。

 

import time
import pymysql
import requests
from bs4 import BeautifulSoup
from requests import RequestException
     
     
def get_one_page(url, timeout=10):
    """Download a page and return its text, or None when the request fails.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; None for any
        other status code or when requests raises a RequestException
        (connection error, timeout, ...).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'
                      + '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    # Optional: response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response.text
    return None
     
def parse_one_page(c, text):
    """Yield one record for every news entry found in a result page.

    Args:
        c: Index into the company list below; selects the company name stored
            with each record.
        text: HTML of the result page. May be None or empty when the download
            failed, in which case nothing is yielded.

    Yields:
        dict with the keys 'company', 'title', 'link', 'source' and 'time'.
    """
    if not text:
        # get_one_page returns None on failure; there is nothing to parse.
        return
    soup = BeautifulSoup(text, 'lxml')
    titles = soup.select('.c-title > a')
    # "class" is a Python keyword, so the filter is spelled class_.
    sources = soup.find_all(name='p', class_='c-author')
    companys = ['阿里巴巴','京东','亚马逊','华为','贵州茅台']
    # zip() stops at the shorter list, so a title without a matching author
    # line is dropped instead of raising IndexError.
    for title, source in zip(titles, sources):
        # get_text() returns the text of a tag including all descendants;
        # .string would return None for a tag that contains child tags.
        # Presumably the author line is "<source>\xa0\xa0<time>", which is why
        # the fields sit at indexes 0 and 2 after splitting - verify against
        # the live page markup.
        parts = source.get_text().strip().split('\xa0')
        yield {
            'company': companys[c],
            'title': title.get_text().strip(),
            'link': title.get('href', ''),
            'source': parts[0].strip(),
            # Fall back to an empty string when the time part is missing.
            'time': parts[2].strip() if len(parts) > 2 else '',
        }

def create_sql():
    """Create the ``spiders`` database and its ``baidu`` table if missing.

    Both statements use IF NOT EXISTS, so calling this function again after
    the schema exists is harmless. Every connection is closed even when a
    statement fails.
    """
    # NOTE(review): the credentials are hard-coded; consider reading them from
    # configuration or the environment.
    server = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
    try:
        cursor = server.cursor()
        # Create the database "spiders".
        cursor.execute("CREATE DATABASE IF NOT EXISTS spiders DEFAULT CHARACTER SET utf8")
    finally:
        server.close()

    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    try:
        cursor = db.cursor()
        # Create the table "baidu"; every column is a required VARCHAR(255).
        sql = ("CREATE TABLE IF NOT EXISTS baidu ("
               "company VARCHAR(255) NOT NULL,"
               "title VARCHAR(255) NOT NULL,"
               "link VARCHAR(255) NOT NULL,"
               "source VARCHAR(255) NOT NULL,"
               "time VARCHAR(255) NOT NULL)")
        cursor.execute(sql)
    finally:
        db.close()
    
def write_to_sql(data):
    """Insert one record into the ``baidu`` table of the ``spiders`` database.

    Args:
        data: Mapping of column name to value. The keys are interpolated into
            the statement as column names, so they must come from trusted
            code; the values are passed as query parameters.

    Prints 'Successful' when a row was inserted and committed, or 'Failed'
    when MySQL rejects the statement (the transaction is then rolled back).
    """
    table = 'baidu'
    keys = ', '.join(data)
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    try:
        cursor = db.cursor()
        # execute() returns the number of affected rows.
        if cursor.execute(sql, tuple(data.values())):
            print('Successful')
            db.commit()
    except pymysql.MySQLError:
        # Only database errors are treated as a failed insert; anything else
        # (KeyboardInterrupt, programming errors) propagates to the caller.
        print('Failed')
        db.rollback()
    finally:
        # Always release the connection, even if rollback itself raises.
        db.close()

    
def main(c, url):
    """Crawl ten result pages for one company and print every parsed record.

    Args:
        c: Index of the company, forwarded to parse_one_page.
        url: Search URL for the company, without the paging parameters.
    """
    # pn is the result offset: 0, 10, ..., 90 gives ten pages.
    for pn in range(0, 91, 10):
        link = url + '&x_bfe_rqs=03E80&tngroupname=organic_news&rsv_dl=news_b_pn&pn=' + str(pn)
        text = get_one_page(link)
        if text is None:
            # The download failed; skip this page and carry on with the next.
            continue
        for item in parse_one_page(c, text):
            print(item)
            # Enable the next line to store each record in MySQL.
            #write_to_sql(item)

if __name__ == '__main__':
    # Uncomment on the first run to create the database and table.
    #create_sql()
    companys = ['阿里巴巴','京东','亚马逊','华为','贵州茅台']
    url = "https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&medium=0"
    # enumerate() gives the index together with the company name; the index is
    # what main() forwards to parse_one_page.
    for c, com in enumerate(companys):
        main(c, url.format(com))
        # Pause for a second between companies.
        time.sleep(1)
posted @ 2019-10-22 20:35  oeong  阅读(787)  评论(0)  编辑  收藏  举报