
Data Collection and Fusion: Fifth Individual Assignment

Preface

This assignment was challenging and quite practical. The MOOC task in particular really tests how familiar you are with Selenium's operations, and there are plenty of pitfalls you only learn about by stepping into them yourself. But after finishing these projects you realize that an automation testing tool like Selenium is the GOAT. A month ago: isn't bs4 + re good enough? What on earth is XPath... Now: Selenium is number one!

Problem 1

JD's anti-scraping measures are weak, so just follow the intent of the question: enter a keyword and start crawling. I typed it out following the course slides, as a warm-up.

  • Results

  • Code

import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import urllib.request
import threading
import sqlite3
import os
import time


class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    imagePath = "download"

    def startUp(self, url, key):
        self.driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")  # raw string so the backslashes are kept literally
        self.threads = []
        self.No = 0
        self.imgNo = 0
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute("drop table phones")
            except:
                pass  # table did not exist yet
            try:
                sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256))"
                self.cursor.execute(sql)
            except:
                pass  # table already exists
        except Exception as e1:
            print(e1)

        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as e2:
            print(e2)
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        try:
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as e3:
            print(e3)

    def showDB(self):
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")

            rows = cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))

            con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download finish", mFile)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""
                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")
                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no = str(self.No).zfill(6)  # zero-pad the sequence number to 6 digits
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.daemon = False  # non-daemon so downloads can finish
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
            try:
                # on the last page the "next" button carries the disabled class; stop there
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except:
                nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                nextPage.click()
                self.processSpider()  # recurse into the next page
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

url = "http://www.jd.com"
spider = MySpider()
while True:
    print("1.爬取")
    print("2.显示")
    print("3.退出")
    s = input("请选择(1,2,3):")
    if s == "1":
        spider.executeSpider(url, "手机")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break
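
One detail in the code above is worth calling out: JD lazy-loads product thumbnails, so an img tag may expose its URL in either src or data-lazy-img, which is why download() accepts two candidate sources. The same fallback, isolated as a tiny sketch (the helper name is mine, not part of the assignment):

def image_url(img):
    # JD lazy-loads thumbnails: an <img> that has not loaded yet keeps its
    # real URL in data-lazy-img instead of src, so try src first, then fall back.
    return img.get_attribute("src") or img.get_attribute("data-lazy-img") or ""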

Problem 2

Problem 2 is fairly simple; in fact I had already implemented it with Selenium in the previous assignment, so it served as a warm-up before starting Problem 3.

  • First, the results in the database

  • The code is as follows

from selenium import webdriver
import pymssql
import time

class stock_pack():
    def initialize(self):
        self.conn = pymssql.connect(host="localhost", user="sa", password="******", database="xrfdb")
        self.cursor = self.conn.cursor()
        self.cursor.execute("delete from stocks")
        driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
        self.get_data(driver,self.cursor)
        self.conn.commit()
        self.conn.close()

    def insertdata(self,code, name, newest_price, up_down_extent, up_down_value, deal_volume, deal_value, freq, highest, lowest,
                 opening, over):
        try:
            self.cursor.execute(
                "insert into stocks (stockcode,stockname,stocknp,stockude,stockudv,stockdv1,stockdv2,stockfreq,stockhighest,stocklowest,stockopening,stockover) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (code, name, newest_price, up_down_extent, up_down_value, deal_volume, deal_value, freq, highest, lowest,
                 opening, over))
        except Exception as e:
            print(e)

    def get_data(self,driver,cursor):
        driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
        try:
            for cnt in range(5):
                tr_list = driver.find_elements_by_xpath("//div[@class='listview full']/table[@class='table_wrapper-table']/tbody/tr")
                for tr in tr_list:
                    tr_val = tr.text.split(" ")
                    code = tr_val[1]
                    name = tr_val[2]
                    newest_price = tr_val[6]
                    up_down_extent = tr_val[7]
                    up_down_value = tr_val[8]
                    deal_volume = tr_val[9]
                    deal_value = tr_val[10]
                    freq = tr_val[11]
                    highest = tr_val[12]
                    lowest = tr_val[13]
                    opening = tr_val[14]
                    over = tr_val[15]
                    self.insertdata(code,name,newest_price,up_down_extent,up_down_value,deal_volume,deal_value,freq,highest,lowest,opening,over)
                # go to next page
                driver.find_elements_by_xpath("//a[@class='next paginate_button']")[-1].click()
                time.sleep(5)

        except Exception as e:
            print(e)

stock_pack().initialize()

  • Approach
    The approach for every problem here is simple: figure out exactly where the data you want lives, then close in on it step by step. For this problem the data clearly sits in a table, so once you can read the table's HTML source, the data is as good as yours. This submission just wraps my previous assignment's code in a class; the functionality is otherwise basically the same. (A more defensive way to parse the table rows is sketched right below.)
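
As a side note, splitting tr.text on spaces breaks if any cell's own text contains a space. A minimal, hypothetical per-cell variant of the row parsing (same old-style Selenium API as above; the column indices are assumptions that should be checked against the live table):

def parse_row(tr):
    # Read each <td> of the row explicitly instead of splitting tr.text.
    cells = [td.text.strip() for td in tr.find_elements_by_xpath("./td")]
    return {
        "code": cells[1],          # assumed: second column holds the stock code
        "name": cells[2],          # assumed: third column holds the stock name
        "newest_price": cells[4],  # assumed position; verify in the page source
    }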

Problem 3

Problem 3 really tests your command of Selenium: not just XPath, but also your understanding of how the driver moves between pages.

  • Approach
    The task requires logging in first, then entering a keyword to search, and then scraping the desired information from the course list, so the code is written in those three steps. Step one, logging in, is fairly easy: it is really just a basic XPath exercise, find the relevant elements and write the XPath. There is one trap, though: Selenium does not hop between frames on its own, so you have to find which frame holds the elements you need and switch to it manually (see the sketch below). Step two, locating the search bar and typing the keyword, is again an XPath exercise. Step three, scraping the information, tests XPath too, but also requires moving the driver between browser tabs, which you have to stumble through yourself.
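
To make the frame pitfall concrete, here is a minimal hypothetical sketch of switching into an iframe and back (the iframe index and input id are taken from login_and_go further down, but treat them as assumptions):

def fill_phone_inside_iframe(driver, phone):
    # Elements inside an iframe are invisible to Selenium until you switch
    # into that frame; switch back out when you are done.
    login_iframe = driver.find_elements_by_tag_name("iframe")[1]  # assumed index
    driver.switch_to.frame(login_iframe)
    driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys(phone)
    driver.switch_to.default_content()  # back to the top-level document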

  • How the code runs

The code opens the MOOC site and first closes the popup (the popup may have disappeared by now; if so, just comment out the popup-closing line in the login function), finds the login box, clicks "log in with phone number", enters the phone number and password automatically, types the keyword into the search box, and after searching clicks each item in the result list to scrape its information, turns the page, repeats, and finally stores everything in the database. Aha, all in one go~ (A one-tab sketch of the handle switching follows.)
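
The tab pitfall can be sketched the same way. insert_data_into_DB below opens every course tab first and then walks the handle list; this hypothetical one-tab version shows just the handle dance:

def scrape_one_tab(driver, xpath):
    # Clicking a course card opens a new tab, but the driver keeps pointing
    # at the old one: switch to the newest handle, read, close, switch back.
    main_handle = driver.current_window_handle
    driver.switch_to.window(driver.window_handles[-1])  # newest tab
    text = driver.find_element_by_xpath(xpath).text.strip()
    driver.close()
    driver.switch_to.window(main_handle)  # back to the list page
    return text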

  • The run results first; the scraped information is printed out

  • And the results in the database

  • Source code:

from selenium import webdriver
import pymssql
import time

class MyConsumingSpider():
    driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
    url = "https://www.icourse163.org/"
    # you can change the keyword; the rest should still work
    keyword = "深度学习"
    cnt = 0

    def login_and_go(self):
        # usage : open the source page and login into your account then search with keyword
        self.driver.get(self.url)
        self.driver.maximize_window()
        time.sleep(2)
        # close the evil popup (it may not show up; if so, comment out the next line)
        self.driver.find_element_by_xpath("//i[@class='ux-icon ux-icon-close']").click()
        
        self.driver.find_element_by_xpath("//a[@class='f-f0 navLoginBtn']").click() # choose login
        time.sleep(2)
        self.driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']//span[@class='ux-login-set-scan-code_ft_back']").click() # choose other way
        time.sleep(2)
        self.driver.find_elements_by_xpath("//div[@class='ux-tabs-underline']//ul[@class='ux-tabs-underline_hd']//li")[1].click() # choose login with tel
        time.sleep(2)

        # when a page has several iframes, use switch_to.frame to enter the one holding your target elements
        temp_iframe_id = self.driver.find_elements_by_tag_name('iframe')[1].get_attribute('id')  # pick the login iframe
        self.driver.switch_to.frame(temp_iframe_id)

        # set username and password to login
        self.driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys("******")
        time.sleep(1)
        self.driver.find_element_by_xpath("//input[@class='j-inputtext dlemail']").send_keys("******")
        time.sleep(0.5)
        self.driver.find_element_by_xpath("//div[@class='f-cb loginbox']//a[@class='u-loginbtn btncolor tabfocus ']").click()
        time.sleep(5)

        # search with keyword
        self.driver.find_element_by_xpath("//div[@class='u-baseinputui']//input[@name='search']").send_keys(self.keyword)
        time.sleep(1)
        self.driver.find_element_by_xpath("//span[@class='u-icon-search2 j-searchBtn']").click()
        time.sleep(3)

    def insert_data_into_DB(self):
        # usage : find the element we need and insert into your DB
        res = self.driver.find_elements_by_xpath("//div[@id='j-courseCardListBox']//div[@class='m-course-list']//div//div[@class='u-clist f-bgw f-cb f-pr j-href ga-click']")
        # after clicking through to other pages, remember to switch window handles, otherwise you will keep going in circles on the original page. That bug bit me hard!
        for r in res:
            # open the pages we need in advance
            r.click()
            time.sleep(1)
        window = self.driver.window_handles
        for i in range(len(window)):
            if (i != len(window) - 1): # should not be our main_page
                # extract parameter
                self.cnt += 1
                self.driver.switch_to.window(window[len(window) - i - 1])
                time.sleep(3)
                name = self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text.strip()
                person = self.driver.find_elements_by_xpath("//div[@class='um-list-slider_con_item']")
                first_teacher = person[0].find_element_by_xpath(".//div[@class='cnt f-fl']//h3[@class='f-fc3']").text.strip()  # leading dot: search relative to this card, not the whole page
                team_member = ""
                for j in range(1, len(person)):
                    team_member += person[j].text.strip().split("\n")[0]
                    team_member += " "
                hot = self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text.strip()
                note = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text.strip()
                ntime = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']").text.strip()
                self.cursor.execute("insert into mooc (course_idx,course_name,course_teacher,course_team,course_hot,course_time,course_note) VALUES (%s,%s,%s,%s,%s,%s,%s)",(self.cnt,name,first_teacher,team_member,hot,ntime,note))
                print(self.cnt,name, first_teacher, team_member, hot, ntime, note)
        # switch among your pages
        for i in range(1, len(window)):
            self.driver.switch_to.window(window[i])
            time.sleep(0.5)
            self.driver.close()
        # back to main_page and prepare to turn to next page
        window = self.driver.window_handles
        self.driver.switch_to.window(window[-1])
        
    def connect_to_SQLSERVER(self):
        # usage : connect to DB
        self.conn = pymssql.connect(host="localhost", user="sa", password="******", database="xrfdb")
        self.cursor = self.conn.cursor()
        self.cursor.execute("delete from mooc")

    def close_DB(self):
        # usage : close your DB
        self.conn.commit()
        self.conn.close()

    def start_my_spider(self):
        # main
        self.connect_to_SQLSERVER()
        self.login_and_go()
        for i in range(3):
            self.insert_data_into_DB()
            # turn to next page
            self.driver.find_element_by_xpath("//li[@class='ux-pager_btn ux-pager_btn__next']//a[@class='th-bk-main-gh']").click()
            time.sleep(5)
        self.close_DB()

MyConsumingSpider().start_my_spider()

  • Summary
    The MOOC project does not actually involve much code, but it took me quite a while. At first I did not want to click into every course in the list to fetch its details; I wanted to scrape straight from the course-list page and paginate there (frankly, I was just being lazy /(ㄒoㄒ)/~~). Partway through I found that the last required field simply is not on that page, so I had to click into each course's detail page after all, rewrote the main part, and picked up plenty of odd knowledge along the way. For an automation testing tool like Selenium, it really is worth spending the time to get your hands dirty (ง •_•)ง.
    The first takeaway from this project is a deeper understanding of how to write XPath, of Selenium's built-in automation functions and how they work, a firmer grasp of reading from and writing to the database, and a clear picture of the steps needed to scrape a page end to end.
    The second takeaway is that it solved a problem I had left hanging: I once used Selenium to scrape NetEase Cloud Music comments to build a word cloud. NetEase paginates within a single page, which is actually easier than this assignment, yet I got stuck on the pagination back then. This assignment finally cracked it. Two birds with one stone~

  • About the code
    A complete piece of code needs repeated testing and revision. For example, I finished writing it last night, and when I ran it today it suddenly stopped working; it turned out the MOOC site had added a popup to its homepage today, which blocked the automated click on the login button, so you have to keep patching the code to maintain it (and in a day or two the popup will probably vanish again). The other big flaw in my code is that it does not use try-except to catch exceptions, which is bad practice; I will keep adding that as time allows~ A small sketch of what that could look like follows.
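
For example, a tiny helper along these lines would let every lookup fail soft instead of crashing the run (the helper name and default are my own, not from the assignment):

def safe_find_text(driver, xpath, default=""):
    # Wrap a single lookup in try-except so one missing element (or a
    # surprise popup) does not abort the whole crawl.
    try:
        return driver.find_element_by_xpath(xpath).text.strip()
    except Exception as e:
        print("lookup failed:", xpath, e)
        return default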

posted @ 2020-11-12 14:37  King_James23