Selenium Web Scraping

Related materials
Disclaimer
I promise that I use this scraper only for learning and for improving my own skills, in order to understand how Internet data collection and analysis work.
I solemnly declare the following:
- Non-commercial use: the scraper serves purely my personal study and will never be used for commercial purposes or for any illegal or non-compliant activity.
- Respect for site rules: when running the scraper I will follow the target site's robots.txt and access rules, and respect any restrictions the site owner places on crawlers.
- Data protection and privacy: I will not collect, store, or exploit any private personal information, and I will not place unnecessary load on, or interfere with, the sites I visit.
- Technical sharing and cooperation: I will follow the spirit of open technical exchange, will not misuse what I learn, and am happy to share my notes with others where appropriate.
- Legal compliance: I will obey local laws and regulations and bear sole responsibility for any risk or liability arising from the use of this scraper.
Purpose
This time I needed to scrape data from a website, but an ordinary crawler only returned the static page because the site's anti-scraping measures are fairly good, so I turned to Selenium, an automation tool that can drive a browser and simulate clicks.
From code it can open a browser automatically, click, fill in forms, navigate between pages, and read HTML elements through its built-in methods (a minimal sketch follows below).
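To make that concrete, here is a minimal, hedged sketch of driving a browser with Selenium; the URL and locators are placeholders, not the site from this post:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()                  # launch a real Chrome window
driver.get("https://example.com")            # placeholder URL
driver.find_element(By.ID, "next").click()   # hypothetical button: simulate a click
title = driver.find_element(By.TAG_NAME, "h1").text  # read an HTML element via a built-in method
print(title)
driver.quit()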
Code
import traceback
from threading import Thread
import requests
from bs4 import BeautifulSoup
import time
import random
from helium import *
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd

# Note: this path points at the Chrome binary and is not actually used below;
# webdriver.Chrome() is created with its default driver lookup.
chrome_driver_path = r'C:\Users\86150\Documents\chrome-win64\chrome-win64\chrome.exe'
starturl = 'url'

def findQuestion(html01):
    try:
        # Locate the question text
        question = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'timu') \
            .find_element(By.CLASS_NAME, 'timu-text')
        question = str(question.text)
        return question
    except Exception as ex:
        print(f"Exception in findQuestion: {ex}")
        return "NaN"

def findSelect(html01):
    try:
        # Locate the answer options
        selectOpt = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'answer-w') \
            .find_element(By.CLASS_NAME, 'options-w') \
            .find_elements(By.TAG_NAME, 'p')
        selectlist = []
        for p_element in selectOpt:
            text = p_element.text
            selectlist.append(text)
        ans = "\n".join(selectlist)
        return ans
    except Exception as ex:
        print(f"Exception in findSelect: {ex}")
        return "NaN"

def findImg(html01):
    try:
        # Locate the question image, if there is one
        imageInf = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'answer-w') \
            .find_element(By.CLASS_NAME, 'media-w') \
            .find_element(By.TAG_NAME, 'img').get_attribute('src')
        return str(imageInf)
    except Exception as ex:
        print("No image for this question")
        return "NaN"

def findAns(html01, wait):
    try:
        # Locate the answer explanation
        ans_element = html01.find_element(By.CLASS_NAME, "com-shiti-xiangjie") \
            .find_element(By.CLASS_NAME, "xiangjie") \
            .find_element(By.CLASS_NAME, "content")
        ans = str(ans_element.get_attribute("innerHTML"))
        if ans == "":
            ans = "Void"
        return ans
    except Exception as ex:
        print(f"Exception in findAns: {ex}")
        return "NaN"

def getAns(html01):
    try:
        # Click the details button (ref="xiangqing") to reveal the explanation
        html01.find_element(By.CLASS_NAME, "tool-bar") \
            .find_element(By.CLASS_NAME, "btn-bar") \
            .find_element(By.CSS_SELECTOR, 'button.right.pt[ref="xiangqing"]').click()
    except Exception as ex:
        print(f'Exception in getAns: {ex}')

def nextOne(html01):
    try:
        # Click the "下一题" (next question) button
        html01.find_element(By.CLASS_NAME, "tool-bar") \
            .find_element(By.CLASS_NAME, "btn-bar") \
            .find_element(By.XPATH, "//button[text()='下一题']").click()
        return True
    except Exception as ex:
        print(f"Exception in nextOne: {ex}")
        return False

def debug(df_train):
    try:
        print(df_train.shape)
        print(df_train.tail(1))
        print("***************************")
        print()
    except Exception as ex:
        print(f"Exception in debug: {ex}")

def getData(url):
    startNum = 0
    df_train = pd.DataFrame({'question': [], 'selectOption': [],
                             'imgUrl': [], 'answer': []})
    # Open the start page
    driver = webdriver.Chrome()
    driver.get(url)
    wait = WebDriverWait(driver, timeout=10)
    time.sleep(10)
    while startNum < 1424:
        startNum = startNum + 1
        try:
            html01 = driver.find_element(By.CLASS_NAME, 'layout-article') \
                .find_element(By.CLASS_NAME, 'news-page') \
                .find_element(By.CLASS_NAME, 'left')
            getAns(html01)
            # Re-locate the container after the click to avoid stale references
            html01 = driver.find_element(By.CLASS_NAME, 'layout-article') \
                .find_element(By.CLASS_NAME, 'news-page') \
                .find_element(By.CLASS_NAME, 'left')
            wait.until(lambda d: html01.is_displayed())
            a1 = findQuestion(html01)
            a2 = findSelect(html01)
            a3 = findImg(html01)
            a4 = findAns(html01, wait)
            '''
            print(a1)
            print(a2)
            print(a3)
            print(a4)
            '''
            tdf = pd.DataFrame({'question': [a1], 'selectOption': [a2],
                                'imgUrl': [a3], 'answer': [a4]})
            df_train = pd.concat([df_train, tdf])
            debug(df_train)
            flag = nextOne(html01)
            if not flag:
                break
        except Exception as ex:
            print("Exception in getData: %s" % ex)
            traceback.print_exc()
    driver.quit()
    return df_train

Data_MONI = getData(starturl)
Data_MONI.to_csv("D:\\moni_Four-DataCsv.csv")
Data_MONI.to_excel('D:\\moni_Four-DataExcel.xlsx')
A word of advice: do not use too many waits. Overusing them produced errors with no message at all, and it also makes the script run very slowly (see the sketch below for keeping it to a single explicit wait).
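What I mean, as a small hedged sketch (reusing the layout-article class name from the code above as the example target): keep one explicit wait on the condition that actually gates the next step, instead of scattering waits and sleeps through the loop.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://example.com")  # placeholder URL

# One explicit wait for the container the loop actually depends on
container = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "layout-article"))
)
print(container.is_displayed())
driver.quit()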
Scraping Comments, Tencent Cloud NLP Sentiment Analysis, and LDA Topic Analysis
Scraping the comments
Trying Scrapy
Drawbacks of Scrapy:
- Harder to get started: you have to let its tooling scaffold a project into a specific folder layout, then write the various methods and settings into the generated .py files before anything works, and finally drive it all from the command line.
- Anti-scraping measures are not as easy to handle as they are in Selenium:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
    ]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("span small::text").get(),
                "tags": quote.css("div.tags a.tag::text").getall(),
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
The example on the official site hinges on this line:
    next_page = response.css("li.next a::attr(href)").get()
There, the <a> tag itself already carries the address of the next page. On the pages I needed, however, the comment data is requested on the fly entirely through AJAX, and scraping that with Scrapy would be rather difficult.
Selenium scraping code
import traceback
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# chrome_driver_path = r'C:\Users\86150\Documents\chrome-win64\chrome-win64\chrome.exe'
chrome_driver_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
starturls = ['https://you.ctrip.com/sight/jian876/140569.html#ctm_ref=www_hp_bs_lst',
             'https://you.ctrip.com/sight/yanshan2389/70727.html',
             'https://you.ctrip.com/sight/anyi2378/119888.html',
             'https://you.ctrip.com/sight/guangchang2434/1979861.html#ctm_ref=www_hp_bs_lst',
             'https://you.ctrip.com/sight/anyi2378/119888.html#ctm_ref=www_hp_bs_lst',
             'https://you.ctrip.com/sight/lean2414/1703527.html',
             'https://you.ctrip.com/sight/fuliang2420/61146.html#ctm_ref=www_hp_bs_lst',
             'https://you.ctrip.com/sight/wuyuan446/140592.html',
             'https://you.ctrip.com/sight/wuyuan446/72527.html',
             'https://you.ctrip.com/sight/shicheng2415/109915.html',
             'https://you.ctrip.com/sight/wuyuan446/127484.html',
             'https://you.ctrip.com/sight/wuyuan446/141512.html']
fileName = "twoQuestion"

def nextOne(rootHtml, driver):
    try:
        # Locate and click the "next page" link of the comment pager
        nextButton = rootHtml.find_element(By.CLASS_NAME, "myPagination") \
            .find_element(By.CLASS_NAME, "ant-pagination-next") \
            .find_element(By.CLASS_NAME, 'ant-pagination-item-comment') \
            .find_element(By.TAG_NAME, "a")
        nextButtonClickable = WebDriverWait(driver, 2).until(EC.element_to_be_clickable(nextButton))
        nextButtonClickable.click()
        return True
    except Exception as ex:
        print(f"Exception in nextOne: {ex}")
        return False

def debug(df_train):
    try:
        print(df_train.shape)
        print(df_train.tail(1).to_string())
        print("***************************")
        print()
    except Exception as ex:
        print(f"Exception in debug: {ex}")

def getData(driver, url, df_train):
    try:
        countNum = 0
        time.sleep(2)
        driver.get(url)
        wait = WebDriverWait(driver, timeout=10)
        rootHtml = driver.find_element(By.CLASS_NAME, 'poiDetailPageWrap')
        wait.until(lambda d: rootHtml.is_displayed())
        # Attraction name
        position = rootHtml.find_element(By.CLASS_NAME, 'title') \
            .find_element(By.TAG_NAME, 'h1')
        position = str(position.text)
        # Upper bound for the loop: the first number in the 'hover-underline' element's text
        maxNum = rootHtml.find_element(By.CLASS_NAME, 'hover-underline')
        maxNum = int(re.search(r'\d+', maxNum.text).group())
        while True:
            time.sleep(0.5)
            rootHtml = driver.find_element(By.CLASS_NAME, 'poiDetailPageWrap')
            wait.until(lambda d: rootHtml.is_displayed())
            contents = rootHtml.find_elements(By.CLASS_NAME, 'commentDetail')
            for content in contents:
                # Drop characters that GBK cannot encode (e.g. emoji)
                content = str(content.text).encode('gbk', 'ignore').decode('gbk')
                tdf = pd.DataFrame({'position': [position], 'content': [content]})
                df_train = pd.concat([df_train, tdf])
                countNum = countNum + 1
                debug(df_train)
            if not nextOne(rootHtml, driver) or countNum >= maxNum:
                break
    except Exception as ex:
        print("Exception in getData: %s" % ex)
        traceback.print_exc()
    return df_train

def solve():
    try:
        # Start the browser and scrape every attraction page in turn
        driver = webdriver.Chrome()
        df_train = pd.DataFrame({'position': [], 'content': []})
        for url in starturls:
            df_train = getData(driver, url, df_train)
            df_train.to_csv(f"D:\\爬虫\\tourism\\{fileName}.csv", index=False, encoding="gbk")
        driver.quit()
    except Exception as ex:
        print("Exception in solve: %s" % ex)

solve()
Using Tencent Cloud NLP
The basic idea:
- pip install Tencent Cloud's Python SDK
- prepare the secret key (SecretId / SecretKey)
- use the SDK to send the request
- receive the returned result
import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.nlp.v20190408 import nlp_client, models

def get_sentiment(text):
    try:
        cred = credential.Credential("SecretId",
                                     "Secretkey")
        # Instantiate an HTTP profile; optional, can be skipped if there is no special need
        httpProfile = HttpProfile()
        httpProfile.endpoint = "nlp.tencentcloudapi.com"
        # Instantiate a client profile; optional, can be skipped if there is no special need
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        # Instantiate the client object for the product being requested; clientProfile is optional
        client = nlp_client.NlpClient(cred, "", clientProfile)
        # Instantiate a request object; each API has its own request class
        req = models.AnalyzeSentimentRequest()
        params = {
            "Text": text
        }
        req.from_json_string(json.dumps(params))
        # resp is an AnalyzeSentimentResponse instance corresponding to the request
        resp = client.AnalyzeSentiment(req)
        resp = json.loads(resp.to_json_string())
        return [resp['Positive'], resp['Negative'], resp['Neutral'], resp['Sentiment']]
        # To print the raw JSON response:
        # print(resp.to_json_string())
    except TencentCloudSDKException as err:
        print(err)

import pandas as pd

df = pd.read_csv("D:\\爬虫\\tourism\\oneQuestion.csv", encoding='gbk')
df['情感分析'] = df['content'].apply(get_sentiment)
df[['积极', '消极', '中立', '情感倾向']] = df['情感分析'].apply(pd.Series)
df = df.drop('情感分析', axis=1)
df.to_csv("./sentiment_data.csv", index=False, encoding="gbk")
Chinese LDA Topic Analysis
The basic idea:
- segment each comment into Chinese words and clean out the stop words
- build the term-frequency matrix
- fit the LDA model
Chinese word segmentation: jieba
For Chinese stop words I used NLTK's list:

from nltk.corpus import stopwords

stop_words = stopwords.words('chinese')  # NLTK's Chinese stop-word list
custom_stop_words = ["好", "好好", "很", "不错", "一个", "不", "去", "都", "挺", "还", "篁", "岭", "安义", "没", "婺源"]
# Extend the stop-word list
stop_words.extend(custom_stop_words)
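One practical note, as a small hedged sketch: NLTK's stop-word corpora are not bundled with the library itself, so they have to be downloaded once before stopwords.words('chinese') will work.

import nltk

# One-time download of the NLTK stop-word corpora (includes the Chinese list)
nltk.download('stopwords')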
Chinese LDA

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import matplotlib.pyplot as plt
import jieba
from nltk.corpus import stopwords

# Read the corpus: df is the comment DataFrame loaded in the sentiment step above
def word_cut(text):
    stop_words = stopwords.words('chinese')
    custom_stop_words = ["好", "好好", "很", "不错", "一个", "不", "去", "都", "挺", "还", "篁", "岭", "安义", "没", "婺源", "瑶里"]
    stop_words.extend(custom_stop_words)
    # jieba segmentation (cut_all=True would switch to full mode)
    # Note: no " ".join(text) is needed here; that was only needed above because
    # the input there was the values of a Series
    wordlist = jieba.cut(text)
    wordlist = [word for word in wordlist if word not in stop_words]
    # Join with spaces so the vectorizer can treat the segments as tokens
    return " ".join(wordlist)

df['content_cut'] = df['content'].apply(word_cut)

n_features = 2500  # number of feature words to keep
tf_vectorizer = CountVectorizer(max_features=n_features,
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(df['content_cut'])
# Row-wise maxima of the term matrix
# row_max_values = np.max(tf, axis=1)
# Row-wise minima of the term matrix
# row_min_values = np.min(tf, axis=1)
# print("Row maxima:", row_max_values)
# print("Row minima:", row_min_values)
# print(tf.toarray())

# First, compute the perplexity curve
plexs = []
n_max_topics = 12
for i in range(1, n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50, random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))

n_t = n_max_topics - 1  # right end of the plotted range; note: must not exceed n_max_topics
x = list(range(1, n_t))
# (plexs[1:n_t] pairs x = i with the perplexity of i + 1 topics; plexs[:n_t - 1] would align them exactly)
plt.plot(x, plexs[1:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()

# Fit the final model and list the top words of each topic
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)

def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        # topic.argsort() sorts the topic's words by weight in ascending order,
        # so slicing backwards from -1 picks out the n_top_words heaviest words
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print("=" * 70)

n_top_words = 6
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

# pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook(local=True)
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.display(pic)  # display in the notebook output cell
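If the interactive chart refuses to render inside the notebook, one hedged alternative (not from the original post) is to write it out as a standalone HTML file and open it in a browser:

# Save the prepared pyLDAvis visualization to a standalone HTML file
pyLDAvis.save_html(pic, "lda_vis.html")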
Fixing garbled Chinese characters in the word cloud
The fix is this part of the code:

# Generate the word cloud from the segmented text
my_wordcloud = WordCloud(background_color='white',
                         mask=nana_coloring,
                         max_words=1000,
                         stopwords=stop_words,
                         max_font_size=150,
                         random_state=30,
                         font_path='msyh.ttc'
                         )

namely font_path='msyh.ttc': point WordCloud at a font that contains Chinese glyphs (msyh.ttc is Microsoft YaHei), otherwise the Chinese words come out garbled.
The full word-cloud code:
#coding=gbk
from os import path
from imageio import imread
import jieba
import sys
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
from collections import Counter
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('chinese')  # NLTK's Chinese stop-word list
custom_stop_words = ["好", "好好", "很", "不错", "一个", "不", "去", "都", "挺", "还", "篁", "岭", "安义", "没", "婺源"]
stop_words.extend(custom_stop_words)

# jieba segmentation (cut_all=True would switch to full mode)
# t_pos_text = pos_text.tostring().decode("gbk", errors="ignore")
# pos_text holds the comment texts prepared earlier (not shown here)
t_pos_text = " ".join(pos_text)
# print(t_pos_text)
# print(t_pos_text2)
wordlist = jieba.cut(t_pos_text)  # cut_all=True
# Join with spaces so WordCloud sees space-separated Chinese tokens
wl_space_split = " ".join(wordlist)
# print(wl_space_split)

d = %pwd
nana_coloring = imread(path.join(d, "古叶.jpg"))
# Generate the word cloud from the segmented text
my_wordcloud = WordCloud(background_color='white',
                         mask=nana_coloring,
                         max_words=1000,
                         stopwords=stop_words,
                         max_font_size=150,
                         random_state=30,
                         font_path='msyh.ttc'
                         )
my_wordcloud.generate(wl_space_split)
my_wordcloud = my_wordcloud.recolor(color_func=lambda *args, **kwargs: "green")
plt.figure(figsize=(10, 8))
plt.imshow(my_wordcloud)  # show the word cloud
plt.axis("off")           # hide the x and y axes
plt.show()
Problems and Errors
Selenium fails to get an element
This kind of problem usually means the .find_element chain is not specific enough, for example:
- the button has to be uniquely identified before it can be clicked, but after the whole chain of .find_element calls the program still matches more than one button.
According to many blog posts, this symptom also appears when the button is covered by something else: since Selenium essentially simulates a human clicking the mouse, a covered button naturally cannot be clicked.
In my case, besides the cause mentioned in those blogs, sometimes the browser window simply "lost" the button while the page was loading.
Suggested fix: adjust the window (but I cannot keep adjusting it by hand during an automated run, so in the end this did not solve it; some commonly suggested workarounds are sketched below).
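For reference, a hedged sketch of the workarounds usually suggested for obscured or off-screen buttons; none of this is from the original post, and the locator is just an example borrowed from the quiz scraper above:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com")  # placeholder URL

driver.maximize_window()  # enlarge the window so the layout does not hide the button

button = driver.find_element(By.XPATH, "//button[text()='下一题']")  # example locator
driver.execute_script("arguments[0].scrollIntoView(true);", button)  # scroll it into the viewport
driver.execute_script("arguments[0].click();", button)  # a JavaScript click is not blocked by overlapping elements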
Chinese encoding error: 'gbk' codec can't encode character '\U0001f44d' in position 15: illegal multibyte sequence
This happens when the text scraped from the page contains characters (an emoji here) that the GBK codec cannot encode.
- Solution: drop the characters GBK cannot encode before saving, which is exactly what the encode/decode round-trip in getData above does (sketched below).
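A minimal sketch of that fix, the same round-trip used in getData above:

text = "不错👍"  # example comment text containing an emoji
clean = text.encode('gbk', 'ignore').decode('gbk')  # characters GBK cannot encode are silently dropped
print(clean)  # -> 不错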
Selenium: how to resolve the error 'chromedriver' executable needs to be in PATH
- Solution: install the latest Selenium; otherwise you have to install chromedriver yourself, which is too much hassle!
Installing the latest Selenium has its own problem, though: the very first load is painfully slow; it seems the new version first goes out to the network on its own and only falls back when that fails (see the sketch below for pointing Selenium at a locally downloaded chromedriver instead).
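If the automatic driver handling is too slow or blocked, a hedged alternative is to download chromedriver manually and hand Selenium its path through a Service object (the path below is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Placeholder path: a manually downloaded chromedriver.exe matching the installed Chrome version
service = Service(r"C:\path\to\chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get("https://example.com")  # placeholder URL
driver.quit()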
pyLDAvis errors
- Solution: a blog post; in short, it patches the pyLDAvis source code.
My question then was: how do I find that source file? What does the ~ in "File ~" refer to?
The location I eventually found was: C:\Users\86150.conda\envs\py38\Lib\site-packages\pyLDAvis\sklearn.py
Then another error appeared:
- Solution: another blog post.
