爬虫获取新浪国内热点新闻(初学)——python2.7

# -*- coding:utf-8 -*-
import io
import os
import re
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

# Python 2-only idiom: switch the process-wide default codec used for implicit
# str <-> unicode conversion to UTF-8.
# NOTE(review): code in this file that writes unicode text to byte-mode files
# may depend on this setting -- confirm before removing it.
reload(sys)
sys.setdefaultencoding('utf-8')


def get_driver(url, chrome_driver=r"C:\Python27\chromedriver.exe", wait_seconds=2):
    """Start a headless Chrome session and load ``url`` in it.

    :param url: address of the page to open.
    :param chrome_driver: path of the chromedriver executable; defaults to
        the location this script was originally written for.
    :param wait_seconds: fixed pause after navigation, giving the page time
        to finish rendering before the caller queries it.
    :return: the WebDriver instance, positioned on ``url``.  The caller owns
        it and is responsible for calling ``quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)

    try:
        driver.get(url)
    except Exception:
        # The browser process is already running at this point; shut it down
        # so a failed navigation does not leave an orphaned Chrome behind.
        driver.quit()
        raise

    time.sleep(wait_seconds)
    return driver


def get_url(driver):
    """Return the ``href`` of every anchor matched by the listing XPath.

    :param driver: WebDriver positioned on the news listing page.
    :return: list with one ``href`` value per matched ``<a>`` element, in
        the order the driver reports them (empty when nothing matches).
    """
    # Absolute XPath to the anchors inside the first <ul> of the listing.
    anchor_xpath = "/html/body/div[7]/div[2]/div[1]/div[2]/div/ul[1]/li/span[2]/a"
    anchors = driver.find_elements_by_xpath(anchor_xpath)
    return [anchor.get_attribute("href") for anchor in anchors]


def get_text(driver, out_dir="sina_new"):
    """Save the article loaded in ``driver`` as a UTF-8 (with BOM) text file.

    The file is named after the text of the element with class
    ``main-title`` and holds the text of the element with class ``article``.

    :param driver: WebDriver positioned on an article page.
    :param out_dir: directory that receives the ``.txt`` file; it is created
        when it does not exist yet.
    :return: path of the file that was written.
    """
    title = driver.find_element_by_class_name("main-title").text
    body = driver.find_element_by_class_name("article").text

    # Headlines often contain characters that are not allowed in Windows file
    # names (quotes, colons, question marks, ...); replace them so open()
    # cannot fail on the title alone.  Trailing dots/spaces are also rejected
    # by Windows, and an empty result falls back to a fixed name.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title).strip().rstrip(". ")
    if not safe_name:
        safe_name = "untitled"

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    path = os.path.join(out_dir, safe_name + ".txt")
    # io.open takes an ``encoding`` argument on Python 2.7 too; "utf-8-sig"
    # emits the BOM itself, so no manual BOM write and no dependence on the
    # interpreter's default encoding.
    # NOTE(review): assumes ``.text`` yields unicode text -- confirm for the
    # Selenium version in use.
    with io.open(path, "w", encoding="utf-8-sig") as f:
        f.write(body)
    return path


if __name__ == '__main__':
    url = "https://news.sina.com.cn/china/"

    # Gather the article links first, then close the listing browser: every
    # get_driver() call launches a separate Chrome process that stays alive
    # until quit() is called on it.
    driver = get_driver(url)
    try:
        url_list = get_url(driver)
    finally:
        driver.quit()

    for url in url_list:
        driver = get_driver(url)
        try:
            get_text(driver)
        finally:
            # Release the per-article browser even when saving fails, so
            # processes do not pile up over the run.
            driver.quit()

posted @ 2022-10-19 17:16  vetra  阅读(62)  评论(0)    收藏  举报