
Data Collection and Fusion: Fifth Individual Assignment

Preface

This assignment was challenging and quite practical. The MOOC task in particular really tests how familiar you are with Selenium's operations, and there are plenty of pitfalls you only learn about by stepping into them yourself. But after finishing these projects you realize that an automation testing tool like Selenium is the GOAT. A month ago: isn't bs4 + re good enough? What on earth is XPath... Now: Selenium is number one!

Problem 1

JD's anti-scraping measures are weak, so just follow the intent of the question: enter a keyword and start crawling. I typed it out following the course slides, as a warm-up.

  • Results

  • Code

import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import urllib.request
import threading
import sqlite3
import os
import time


class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    imagePath = "download"

    def startUp(self, url, key):
        self.driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")  # raw string so the backslashes are kept literally
        self.threads = []
        self.No = 0
        self.imgNo = 0
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute("drop table phones")
            except:
                pass  # table did not exist yet
            try:
                sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256))"
                self.cursor.execute(sql)
            except:
                pass  # table already exists
        except Exception as e1:
            print(e1)

        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as e2:
            print(e2)
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        try:
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as e3:
            print(e3)

    def showDB(self):
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")

            rows = cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))

            con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download finish", mFile)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""
                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")
                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no = str(self.No).zfill(6)  # zero-pad the sequence number to 6 digits
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.daemon = False  # non-daemon so downloads can finish
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
            try:
                # on the last page the "next" button carries the disabled class; stop there
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except:
                nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                nextPage.click()
                self.processSpider()  # recurse into the next page
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

url = "http://www.jd.com"
spider = MySpider()
while True:
    print("1.爬取")
    print("2.显示")
    print("3.退出")
    s = input("请选择(1,2,3):")
    if s == "1":
        spider.executeSpider(url, "手机")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break
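
One detail in the code above is worth calling out: JD lazy-loads product thumbnails, so an img tag may expose its URL in either src or data-lazy-img, which is why download() accepts two candidate sources. The same fallback, isolated as a tiny sketch (the helper name is mine, not part of the assignment):

def image_url(img):
    # JD lazy-loads thumbnails: an <img> that has not loaded yet keeps its
    # real URL in data-lazy-img instead of src, so try src first, then fall back.
    return img.get_attribute("src") or img.get_attribute("data-lazy-img") or ""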

Problem 2

Problem 2 is fairly simple; in fact I had already implemented it with Selenium in the previous assignment, so it served as a warm-up before starting Problem 3.

  • First, the results in the database

  • The code is as follows

from selenium import webdriver
import pymssql
import time

class stock_pack():
    def initialize(self):
        self.conn = pymssql.connect(host="localhost", user="sa", password="******", database="xrfdb")
        self.cursor = self.conn.cursor()
        self.cursor.execute("delete from stocks")
        driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
        self.get_data(driver,self.cursor)
        self.conn.commit()
        self.conn.close()

    def insertdata(self,code, name, newest_price, up_down_extent, up_down_value, deal_volume, deal_value, freq, highest, lowest,
                 opening, over):
        try:
            self.cursor.execute(
                "insert into stocks (stockcode,stockname,stocknp,stockude,stockudv,stockdv1,stockdv2,stockfreq,stockhighest,stocklowest,stockopening,stockover) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (code, name, newest_price, up_down_extent, up_down_value, deal_volume, deal_value, freq, highest, lowest,
                 opening, over))
        except Exception as e:
            print(e)

    def get_data(self,driver,cursor):
        driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
        try:
            for cnt in range(5):
                tr_list = driver.find_elements_by_xpath("//div[@class='listview full']/table[@class='table_wrapper-table']/tbody/tr")
                for tr in tr_list:
                    tr_val = tr.text.split(" ")
                    code = tr_val[1]
                    name = tr_val[2]
                    newest_price = tr_val[6]
                    up_down_extent = tr_val[7]
                    up_down_value = tr_val[8]
                    deal_volume = tr_val[9]
                    deal_value = tr_val[10]
                    freq = tr_val[11]
                    highest = tr_val[12]
                    lowest = tr_val[13]
                    opening = tr_val[14]
                    over = tr_val[15]
                    self.insertdata(code,name,newest_price,up_down_extent,up_down_value,deal_volume,deal_value,freq,highest,lowest,opening,over)
                # go to next page
                driver.find_elements_by_xpath("//a[@class='next paginate_button']")[-1].click()
                time.sleep(5)

        except Exception as e:
            print(e)

stock_pack().initialize()

  • Approach
    The approach for every problem here is simple: figure out exactly where the data you want lives, then close in on it step by step. For this problem the data clearly sits in a table, so once you can read the table's HTML source, the data is as good as yours. This submission just wraps my previous assignment's code in a class; the functionality is otherwise basically the same. (A more defensive way to parse the table rows is sketched right below.)
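
As a side note, splitting tr.text on spaces breaks if any cell's own text contains a space. A minimal, hypothetical per-cell variant of the row parsing (same old-style Selenium API as above; the column indices are assumptions that should be checked against the live table):

def parse_row(tr):
    # Read each <td> of the row explicitly instead of splitting tr.text.
    cells = [td.text.strip() for td in tr.find_elements_by_xpath("./td")]
    return {
        "code": cells[1],          # assumed: second column holds the stock code
        "name": cells[2],          # assumed: third column holds the stock name
        "newest_price": cells[4],  # assumed position; verify in the page source
    }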

Problem 3

Problem 3 really tests your command of Selenium: not just XPath, but also your understanding of how the driver moves between pages.

  • Approach
    The task requires logging in first, then entering a keyword to search, and then scraping the desired information from the course list, so the code is written in those three steps. Step one, logging in, is fairly easy: it is really just a basic XPath exercise, find the relevant elements and write the XPath. There is one trap, though: Selenium does not hop between frames on its own, so you have to find which frame holds the elements you need and switch to it manually (see the sketch below). Step two, locating the search bar and typing the keyword, is again an XPath exercise. Step three, scraping the information, tests XPath too, but also requires moving the driver between browser tabs, which you have to stumble through yourself.
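
To make the frame pitfall concrete, here is a minimal hypothetical sketch of switching into an iframe and back (the iframe index and input id are taken from login_and_go further down, but treat them as assumptions):

def fill_phone_inside_iframe(driver, phone):
    # Elements inside an iframe are invisible to Selenium until you switch
    # into that frame; switch back out when you are done.
    login_iframe = driver.find_elements_by_tag_name("iframe")[1]  # assumed index
    driver.switch_to.frame(login_iframe)
    driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys(phone)
    driver.switch_to.default_content()  # back to the top-level document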

  • How the code runs

The code opens the MOOC site and first closes the popup (the popup may have disappeared by now; if so, just comment out the popup-closing line in the login function), finds the login box, clicks "log in with phone number", enters the phone number and password automatically, types the keyword into the search box, and after searching clicks each item in the result list to scrape its information, turns the page, repeats, and finally stores everything in the database. Aha, all in one go~ (A one-tab sketch of the handle switching follows.)
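
The tab pitfall can be sketched the same way. insert_data_into_DB below opens every course tab first and then walks the handle list; this hypothetical one-tab version shows just the handle dance:

def scrape_one_tab(driver, xpath):
    # Clicking a course card opens a new tab, but the driver keeps pointing
    # at the old one: switch to the newest handle, read, close, switch back.
    main_handle = driver.current_window_handle
    driver.switch_to.window(driver.window_handles[-1])  # newest tab
    text = driver.find_element_by_xpath(xpath).text.strip()
    driver.close()
    driver.switch_to.window(main_handle)  # back to the list page
    return text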

  • The run results first; the scraped information is printed out

  • And the results in the database

  • Source code:

from selenium import webdriver
import pymssql
import time

class MyConsumingSpider():
    driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
    url = "https://www.icourse163.org/"
    # you can change the keyword; the rest should still work
    keyword = "深度学习"
    cnt = 0

    def login_and_go(self):
        # usage : open the source page and login into your account then search with keyword
        self.driver.get(self.url)
        self.driver.maximize_window()
        time.sleep(2)
        # close the evil popup (it may not show up; if so, comment out the next line)
        self.driver.find_element_by_xpath("//i[@class='ux-icon ux-icon-close']").click()
        
        self.driver.find_element_by_xpath("//a[@class='f-f0 navLoginBtn']").click() # choose login
        time.sleep(2)
        self.driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']//span[@class='ux-login-set-scan-code_ft_back']").click() # choose other way
        time.sleep(2)
        self.driver.find_elements_by_xpath("//div[@class='ux-tabs-underline']//ul[@class='ux-tabs-underline_hd']//li")[1].click() # choose login with tel
        time.sleep(2)

        # when a page has several iframes, use switch_to.frame to enter the one holding your target elements
        temp_iframe_id = self.driver.find_elements_by_tag_name('iframe')[1].get_attribute('id')  # pick the login iframe
        self.driver.switch_to.frame(temp_iframe_id)

        # set username and password to login
        self.driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys("******")
        time.sleep(1)
        self.driver.find_element_by_xpath("//input[@class='j-inputtext dlemail']").send_keys("******")
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//div[@class='f-cb loginbox']//a[@class='u-loginbtn btncolor tabfocus ']").click()
        time.sleep(5)

        # search with keyword
        self.driver.find_element_by_xpath("//div[@class='u-baseinputui']//input[@name='search']").send_keys(self.keyword)
        time.sleep(1)
        self.driver.find_element_by_xpath("//span[@class='u-icon-search2 j-searchBtn']").click()
        time.sleep(3)

    def insert_data_into_DB(self):
        # usage : find the element we need and insert into your DB
        res = self.driver.find_elements_by_xpath("//div[@id='j-courseCardListBox']//div[@class='m-course-list']//div//div[@class='u-clist f-bgw f-cb f-pr j-href ga-click']")
        # after clicking through to other pages, remember to switch window handles, otherwise you will keep going in circles on the original page. That bug bit me hard!
        for r in res:
            # open the pages we need in advance
            r.click()
            time.sleep(1)
        window = self.driver.window_handles
        for i in range(len(window)):
            if (i != len(window) - 1): # should not be our main_page
                # extract parameter
                self.cnt += 1
                self.driver.switch_to.window(window[len(window) - i - 1])
                time.sleep(3)
                name = self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text.strip()
                person = self.driver.find_elements_by_xpath("//div[@class='um-list-slider_con_item']")
                first_teacher = person[0].find_element_by_xpath(".//div[@class='cnt f-fl']//h3[@class='f-fc3']").text.strip()  # leading dot: search relative to this card, not the whole page
                team_member = ""
                for j in range(1, len(person)):
                    team_member += person[j].text.strip().split("\n")[0]
                    team_member += " "
                hot = self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text.strip()
                note = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text.strip()
                ntime = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']").text.strip()
                self.cursor.execute("insert into mooc (course_idx,course_name,course_teacher,course_team,course_hot,course_time,course_note) VALUES (%s,%s,%s,%s,%s,%s,%s)",(self.cnt,name,first_teacher,team_member,hot,ntime,note))
                print(self.cnt,name, first_teacher, team_member, hot, ntime, note)
        # switch among your pages
        for i in range(1, len(window)):
            self.driver.switch_to.window(window[i])
            time.sleep(0.5)
            self.driver.close()
        # back to main_page and prepare to turn to next page
        window = self.driver.window_handles
        self.driver.switch_to.window(window[-1])
        
    def connect_to_SQLSERVER(self):
        # usage : connect to DB
        self.conn = pymssql.connect(host="localhost", user="sa", password="******", database="xrfdb")
        self.cursor = self.conn.cursor()
        self.cursor.execute("delete from mooc")

    def close_DB(self):
        # usage : close your DB
        self.conn.commit()
        self.conn.close()

    def start_my_spider(self):
        # main
        self.connect_to_SQLSERVER()
        self.login_and_go()
        for i in range(3):
            self.insert_data_into_DB()
            # turn to next page
            self.driver.find_element_by_xpath("//li[@class='ux-pager_btn ux-pager_btn__next']//a[@class='th-bk-main-gh']").click()
            time.sleep(5)
        self.close_DB()

MyConsumingSpider().start_my_spider()

  • Summary
    The MOOC project does not actually involve much code, but it took me quite a while. At first I did not want to click into every course in the list to fetch its details; I wanted to scrape straight from the course-list page and paginate there (frankly, I was just being lazy /(ㄒoㄒ)/~~). Partway through I found that the last required field simply is not on that page, so I had to click into each course's detail page after all, rewrote the main part, and picked up plenty of odd knowledge along the way. For an automation testing tool like Selenium, it really is worth spending the time to get your hands dirty (ง •_•)ง.
    The first takeaway from this project is a deeper understanding of how to write XPath, of Selenium's built-in automation functions and how they work, a firmer grasp of reading from and writing to the database, and a clear picture of the steps needed to scrape a page end to end.
    The second takeaway is that it solved a problem I had left hanging: I once used Selenium to scrape NetEase Cloud Music comments to build a word cloud. NetEase paginates within a single page, which is actually easier than this assignment, yet I got stuck on the pagination back then. This assignment finally cracked it. Two birds with one stone~

  • About the code
    A complete piece of code needs repeated testing and revision. For example, I finished writing it last night, and when I ran it today it suddenly stopped working; it turned out the MOOC site had added a popup to its homepage today, which blocked the automated click on the login button, so you have to keep patching the code to maintain it (and in a day or two the popup will probably vanish again). The other big flaw in my code is that it does not use try-except to catch exceptions, which is bad practice; I will keep adding that as time allows~ A small sketch of what that could look like follows.
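
For example, a tiny helper along these lines would let every lookup fail soft instead of crashing the run (the helper name and default are my own, not from the assignment):

def safe_find_text(driver, xpath, default=""):
    # Wrap a single lookup in try-except so one missing element (or a
    # surprise popup) does not abort the whole crawl.
    try:
        return driver.find_element_by_xpath(xpath).text.strip()
    except Exception as e:
        print("lookup failed:", xpath, e)
        return default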

posted @ 2020-11-12 14:37  King_James23