第五次作业

作业一

要求

熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架爬取京东商城某类商品信息及图片。

代码

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql

class MySpider:
    """Selenium crawler that stores JD.com search results in MySQL.

    ``executeSpider`` drives one run: it opens the site in Chrome, submits a
    search keyword, walks the result pages, inserts one row per product into
    the MySQL table ``computer`` and downloads each product image into
    ``imagePath`` on a background thread.  ``showDB`` prints the stored rows.
    """

    # Request headers sent when downloading product images.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    # Directory (relative to the working directory) that receives the images.
    imagePath = "download"

    # MySQL connection settings shared by startUp() and showDB().
    dbConfig = {"host": "127.0.0.1", "port": 3306, "user": "root",
                "passwd": "123456", "db": "mydb", "charset": "utf8"}

    @staticmethod
    def _formatNo(number):
        """Return ``number`` as a string left-padded with zeros to 6 digits."""
        return str(number).zfill(6)

    @staticmethod
    def _cleanText(text):
        """Strip the "爱心东东" badge line and all commas from ``text``."""
        return text.replace("爱心东东\n", "").replace(",", "")

    @staticmethod
    def _splitMarkAndNote(rawNote):
        """Return ``(mark, note)`` derived from a product title.

        ``mark`` is the text before the first space of the raw title and
        ``note`` is the whole title; both are passed through ``_cleanText``.
        """
        return (MySpider._cleanText(rawNote.split(" ")[0]),
                MySpider._cleanText(rawNote))

    @staticmethod
    def _imageFileName(no, src):
        """Return the local file name for image URL ``src``: ``no`` + extension.

        The extension is taken from the URL *path* only, so a query string or
        a dot in the host name can no longer leak into the file name.
        """
        from urllib.parse import urlparse
        return no + os.path.splitext(urlparse(src).path)[1]

    def startUp(self, url, key):
        """Open Chrome, reset the DB table and image folder, and search ``key``.

        :param url: address of the page that contains the search box.
        :param key: keyword typed into the element with id ``key``.
        """
        self.driver = webdriver.Chrome()

        self.threads = []
        self.No = 0
        self.imgNo = 0

        # Database: best effort -- a failure is reported and the crawl goes on
        # without storing rows (insertDB skips when there is no cursor).
        self.con = None
        self.cursor = None
        try:
            self.con = pymysql.connect(**MySpider.dbConfig)
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from computer")
        except Exception as err:
            print(err)

        # Image folder: create it if needed and remove files from earlier runs.
        try:
            os.makedirs(MySpider.imagePath, exist_ok=True)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)

        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        # Submit the search by pressing Enter.
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        """Commit and close the DB connection, then shut the browser down.

        The two steps are independent so that a database error cannot leave
        the browser running.  ``quit()`` is used instead of ``close()`` so the
        whole WebDriver session ends, not only the current window.
        """
        con = getattr(self, "con", None)
        if con is not None:
            try:
                con.commit()
                con.close()
            except Exception as err:
                print(err)
            self.con = None
            self.cursor = None

        driver = getattr(self, "driver", None)
        if driver is not None:
            try:
                driver.quit()
            except Exception as err:
                print(err)
            self.driver = None

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        """Insert one product row into ``computer``; errors are printed."""
        cursor = getattr(self, "cursor", None)
        if cursor is None:
            # No usable connection (startUp already reported why).
            return
        try:
            cursor.execute(
                "insert into computer (mNo,mMark,mPrice,mNote,mFile) values (%s,%s,%s,%s,%s)",
                (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print(err)

    def showDB(self):
        """Print every row of the MySQL table ``computer`` ordered by ``mNo``.

        Opens its own short-lived connection so it also works when no crawl
        has been run in this process (or after closeUp closed the crawl's
        connection).
        """
        rowFormat = "%-8s %-16s %-8s %-16s %s"
        try:
            con = pymysql.connect(**MySpider.dbConfig)
            try:
                cursor = con.cursor()
                cursor.execute("select mNo,mMark,mPrice,mFile,mNote from computer order by mNo")
                rows = cursor.fetchall()
            finally:
                con.close()
            print(rowFormat % ("No", "Mark", "Price", "Image", "Note"))
            for row in rows:
                print(rowFormat % tuple(row))
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        """Fetch an image from ``src1`` (or ``src2`` as fallback) and save it.

        :param src1: preferred image URL; may be empty.
        :param src2: fallback URL tried only when ``src1`` yields no data.
        :param mFile: file name to write inside ``imagePath``.

        Runs on a worker thread, so failures are printed rather than raised.
        """
        data = None
        for src in (src1, src2):
            if data or not src:
                continue
            try:
                req = urllib.request.Request(src, headers=MySpider.headers)
                with urllib.request.urlopen(req, timeout=10) as resp:
                    data = resp.read()
            except Exception as err:
                print("download failed", src, err)
        if not data:
            return
        print("download begin", mFile)
        try:
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
        except OSError as err:
            print("download failed", mFile, err)
            return
        print("download finish", mFile)

    def _processItem(self, li):
        """Extract one product from result element ``li``, store and download it."""
        from urllib.parse import urljoin

        # The image URL is either in ``src`` or in ``data-lazy-img``.
        try:
            img = li.find_element_by_xpath(".//div[@class='p-img']//a//img")
            src1 = img.get_attribute("src") or ""
            src2 = img.get_attribute("data-lazy-img") or ""
        except Exception:
            src1 = src2 = ""

        try:
            price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
        except Exception:
            price = "0"

        try:
            rawNote = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
        except Exception:
            rawNote = ""
        mark, note = MySpider._splitMarkAndNote(rawNote)

        self.No = self.No + 1
        no = MySpider._formatNo(self.No)
        print(no, mark, price)

        # Resolve both candidates against the page URL so the fallback is
        # usable too, then name the file after whichever URL comes first.
        pageUrl = self.driver.current_url
        if src1:
            src1 = urljoin(pageUrl, src1)
        if src2:
            src2 = urljoin(pageUrl, src2)
        mFile = ""
        if src1 or src2:
            mFile = MySpider._imageFileName(no, src1 or src2)
            T = threading.Thread(target=self.download, args=(src1, src2, mFile),
                                 daemon=False)
            T.start()
            self.threads.append(T)
        self.insertDB(no, mark, price, note, mFile)

    def processSpider(self):
        """Process the current result page and every following page.

        Stops when the "next" link is marked disabled; any other error ends
        the crawl and is printed.
        """
        try:
            while True:
                time.sleep(1)
                print(self.driver.current_url)
                lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
                for li in lis:
                    self._processItem(li)

                # Last page: the "next" link carries the extra class "disabled".
                if self.driver.find_elements_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']"):
                    break
                nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                nextPage.click()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        """Run a complete crawl for ``key`` starting at ``url``.

        Cleanup (closing DB/browser and joining the download threads) runs
        even when start-up or crawling raises.
        """
        starttime = datetime.datetime.now()
        print("Spider starting......")
        try:
            self.startUp(url, key)
            print("Spider processing......")
            self.processSpider()
        finally:
            print("Spider closing......")
            self.closeUp()
            for t in getattr(self, "threads", []):
                t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = int((endtime - starttime).total_seconds())
        print("Total ", elapsed, " seconds elapsed")


# Interactive menu: "1" runs a crawl, "2" prints the stored rows, "3" quits.
# Any other input simply shows the menu again.
url = "http://www.jd.com"
spider = MySpider()
running = True
while running:
    for menuLine in ("1.爬取", "2.显示", "3.退出"):
        print(menuLine)
    s = input("请选择(1,2,3):")
    if s == "3":
        running = False
    elif s == "1":
        spider.executeSpider(url, "电脑")
    elif s == "2":
        spider.showDB()

运行结果截图

心得体会

1.熟悉了selenium的流程与使用，大致分为：创建一个浏览器对象，通过指定url访问页面，操作页面并获取页面信息，关闭浏览器对象。
2.更加熟悉了python中连接mysql数据库的流程
3.加强了对F12的理解和对xpath提取信息的使用

作业二

要求

熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。

候选网站

东方财富网：https://www.eastmoney.com/

思路

1.建立一个谷歌浏览器对象，建立与mysql数据库的连接
2.指定要爬取网页的url
3.访问网页，通过xpath爬取股票，并通过翻页处理爬取三页的股票信息
4.通过访问相应的url分别爬取三种类型的股票
5.将爬取的股票信息存储进mysql数据库中
6.关闭浏览器对象，结束与mysql的连接

代码


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pymysql

class getStocks:
    """Selenium crawler that stores eastmoney.com stock tables in MySQL.

    ``executeSpider`` crawls the board at the given URL and then the two
    boards listed in ``otherBoards``; every table row becomes one record in
    the MySQL table ``stocks``.
    """

    # Kept for compatibility; no method of this class reads it.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)"
                            "Gecko/2008072421 Minefield/3.0.2pre"
    }

    # 1-based number of the page currently being processed for a board.
    num = 1

    # Boards crawled after the one passed to executeSpider(): (label, url).
    otherBoards = (
        ("上证A股", "http://quote.eastmoney.com/center/gridlist.html#sh_a_board"),
        ("深证A股", "http://quote.eastmoney.com/center/gridlist.html#sz_a_board"),
    )

    # XPath of each stored cell relative to its <tr>, in insertDB()'s
    # parameter order (id, bCode, bName, ..., bYesterday).
    cellXpaths = (
        "./td[position()=1]",
        "./td[position()=2]/a",
        "./td[position()=3]/a",
        "./td[position()=5]/span",
        "./td[position()=6]/span",
        "./td[position()=7]/span",
        "./td[position()=8]",
        "./td[position()=9]",
        "./td[position()=10]",
        "./td[position()=11]/span",
        "./td[position()=12]/span",
        "./td[position()=13]/span",
        "./td[position()=14]",
    )

    def startUp(self,url):
        """Start headless Chrome, empty the ``stocks`` table and open ``url``."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)

        # Database: best effort -- a failure is printed and the crawl goes on
        # without storing rows (insertDB skips when there is no cursor).
        self.con = None
        self.cursor = None
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks")
        except Exception as err:
            print(err)
        self.driver.get(url)

    def closeUp(self):
        """Commit and close the DB connection, then shut the browser down.

        The two steps are independent so that a database error cannot leave
        the browser running.  ``quit()`` is used instead of ``close()`` so the
        whole WebDriver session ends, not only the current window.
        """
        con = getattr(self, "con", None)
        if con is not None:
            try:
                con.commit()
                con.close()
            except Exception as err:
                print(err)
            self.con = None
            self.cursor = None

        driver = getattr(self, "driver", None)
        if driver is not None:
            try:
                driver.quit()
            except Exception as err:
                print(err)
            self.driver = None

    def insertDB(self,id,bCode,bName,bLatestPrice,bUpDownRange,bUpDownPrice,bTurnover,bTurnoverNum,bAmplitude,bHighest,bLowest,bToday,bYesterday):
        """Print one stock record and insert it into ``stocks``.

        All thirteen values are passed as query parameters; an insert error
        is printed and otherwise ignored.
        """
        record = (id,bCode,bName,bLatestPrice,bUpDownRange,bUpDownPrice,bTurnover,bTurnoverNum,bAmplitude,bHighest,bLowest,bToday,bYesterday)
        print(*record)
        cursor = getattr(self, "cursor", None)
        if cursor is None:
            # No usable connection (startUp already reported why).
            return
        try:
            cursor.execute("insert into stocks (id,bCode,bName,bLatestPrice,bUpDownRange,bUpDownPrice,bTurnover,"
                           "bTurnoverNum,bAmplitude,bHighest,bLowest,bToday,bYesterday) values "
                           "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                           record)
        except Exception as err:
            print(err)

    def processSpider(self, maxPages=3):
        """Store the rows of the current board, following "next" links.

        :param maxPages: highest page number to process; ``self.num`` is the
            number of the page being processed when this method is entered.

        Stops at ``maxPages``, when the "next" button is disabled, or on the
        first page-level error (which is printed).  A row that cannot be read
        is reported and skipped.
        """
        paginate = ("//div[@class='dataTables_wrapper']"
                    "//div[@class='dataTables_paginate paging_input']")
        try:
            while True:
                time.sleep(1)
                # Show which page is being read.
                print(self.driver.current_url)
                trs = self.driver.find_elements_by_xpath("//div[@class='listview full']/table[@id='table_wrapper-table']/tbody/tr")
                for tr in trs:
                    try:
                        values = [tr.find_element_by_xpath(xp).text
                                  for xp in getStocks.cellXpaths]
                    except Exception as err:
                        print(err)
                        continue
                    self.insertDB(*values)

                # Check the limit first so no time is spent waiting for a
                # page that will not be opened.
                if self.num >= maxPages:
                    break
                if self.driver.find_elements_by_xpath(paginate + "//a[@class='next paginate_button disabled']"):
                    break
                nextPage = self.driver.find_element_by_xpath(paginate + "//a[@class='next paginate_button']")
                time.sleep(10)
                self.num += 1
                nextPage.click()
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        """Crawl the board at ``url`` and then every board in ``otherBoards``.

        Browser and database are closed even when crawling raises.
        """
        print("Spider starting......")
        try:
            self.startUp(url)
            print("Spider processing......")
            self.num = 1
            print("沪深A股")
            self.processSpider()

            for boardName, boardUrl in getStocks.otherBoards:
                self.num = 1
                self.driver.get(boardUrl)
                print(boardName)
                self.processSpider()
        finally:
            print("Spider closing......")
            self.closeUp()

# Entry point of the stock crawler: start at the first board page and let the
# spider visit the remaining boards itself.
url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
spider = getStocks()
spider.executeSpider(url)
# The string below is never executed; it records the MySQL DDL for the
# `stocks` table that the spider inserts into (all columns are varchar(128)).
'''use mydb;
create table stocks
(id varchar(128),
bCode varchar(128),
bName varchar(128),
bLatestPrice varchar(128),
bUpDownRange varchar(128),
bUpDownPrice varchar(128),
bTurnover varchar(128),
bTurnoverNum varchar(128),
bAmplitude varchar(128),
bHighest varchar(128),
bLowest varchar(128),
bToday varchar(128),
bYesterday varchar(128));'''

运行结果截图

心得体会

1.更加熟悉了selenium框架，对元素的定位也更加熟练
2.在打这题的代码过程中由于不小心将element打成了elements，一直报错，最后还是同学提醒了才发现。在之后的编码过程中要更加的细心，注意在一些细节的地方多留心。

作业三

要求

熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
使用Selenium框架+MySQL爬取中国mooc网课程资源信息（课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介）

候选网站

中国mooc网：https://www.icourse163.org

思路

1.建立浏览器对象，连接mysql数据库
2.通过url访问页面，进行登录操作
3.通过搜索，查找想要爬取的课程信息
4.逐个点击课程，通过跳转页面来获取课程中的信息
5.关闭课程页面，返回总课程页面，继续之后课程的爬取
6.进行翻页操作，爬取更多的课程信息
7.结束爬取，关闭浏览器，将爬取到的数据存储到数据库中

代码

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pymysql
import re

class getMoocs:
    """Selenium crawler that stores icourse163.org course details in MySQL.

    ``executeSpider`` opens the site, logs in, searches for a keyword and
    visits every course card on the first result pages; each course becomes
    one record in the MySQL table ``moocs``.
    """

    # Kept for compatibility; no method of this class reads it.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)"
                            "Gecko/2008072421 Minefield/3.0.2pre"
    }

    # Id given to the next stored course.
    no = 1
    # 1-based number of the result page currently being processed.
    page = 1

    @staticmethod
    def _joinNames(names):
        """Return the non-empty ``names`` joined with commas."""
        return ",".join(name for name in names if name)

    @staticmethod
    def _extractCount(text):
        """Return the first run of digits in ``text``, or ``"0"`` if none."""
        found = re.search(r'\d+', text or "")
        return found.group() if found else "0"

    def startUp(self,url):
        """Start Chrome (with a visible window), connect to MySQL, open ``url``.

        Rows already present in ``moocs`` are kept.
        """
        self.driver = webdriver.Chrome()

        # Database: best effort -- a failure is printed and the crawl goes on
        # without storing rows (insertDB skips when there is no cursor).
        self.con = None
        self.cursor = None
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
        except Exception as err:
            print(err)
        self.driver.get(url)

    def closeUp(self):
        """Commit and close the DB connection, then shut the browser down.

        The two steps are independent so that a database error cannot leave
        the browser running.  ``quit()`` is used instead of ``close()`` so the
        whole WebDriver session ends, not only the current window.
        """
        con = getattr(self, "con", None)
        if con is not None:
            try:
                con.commit()
                con.close()
            except Exception as err:
                print(err)
            self.con = None
            self.cursor = None

        driver = getattr(self, "driver", None)
        if driver is not None:
            try:
                driver.quit()
            except Exception as err:
                print(err)
            self.driver = None

    def signUp(self, account=None, password=None):
        """Log in through the site's e-mail/password form.

        :param account: login e-mail; defaults to env var ``MOOC_ACCOUNT``.
        :param password: login password; defaults to env var ``MOOC_PASSWORD``.
        :raises ValueError: if either credential is missing.

        Credentials are deliberately not stored in the source file.
        """
        import os

        account = account or os.environ.get("MOOC_ACCOUNT")
        password = password or os.environ.get("MOOC_PASSWORD")
        if not account or not password:
            raise ValueError("set MOOC_ACCOUNT and MOOC_PASSWORD (or pass account/password) to log in")

        time.sleep(5)
        # Open the login dialog, then leave the QR-code view for the form view.
        self.driver.find_element_by_xpath("//div[@class='m-navTop-func']//div[@class='u-navLogin-loginBox']//div[@class='m-navlinks']").click()
        time.sleep(1)
        self.driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']/span[@class='ux-login-set-scan-code_ft_back']").click()
        time.sleep(10)
        # The login form lives inside an iframe.
        self.driver.switch_to.frame(self.driver.find_element_by_xpath("//div[@class='ux-login-set-container']/iframe"))
        self.driver.find_element_by_id("account-box").click()
        self.driver.find_element_by_xpath("//div[@class='inputbox active']//input").send_keys(account)
        self.driver.find_element_by_xpath("//div[@class='inputbox']").click()
        self.driver.find_elements_by_xpath("//div[@class='inputbox active']//div[@class='u-input box']/input")[1].send_keys(password)
        self.driver.find_element_by_xpath("//div[@class='f-cb loginbox']").click()
        time.sleep(10)
        # Leave the login iframe so later lookups run against the main page.
        self.driver.switch_to.default_content()

    def _readCourse(self):
        """Read the course page shown in the current window.

        :return: ``(cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)``.
        """
        cCourse = self.driver.find_element_by_xpath("//div[@id='g-body']//span[@class='course-title f-ib f-vam']").text
        cCollege = self.driver.find_element_by_xpath("//a[@data-action='点击学校logo']/img[@class='u-img']").get_attribute('alt')
        cTeacher = self.driver.find_element_by_xpath("//div[@data-action='点击课程团队头像']/div/h3").text
        teachers = self.driver.find_elements_by_xpath("//div[@class='um-list-slider_con']/div")
        cTeam = getMoocs._joinNames(
            teacher.find_element_by_xpath("./div//img").get_attribute("alt")
            for teacher in teachers)
        # Keep only the digits of the enrolment text.
        cCount = getMoocs._extractCount(
            self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-enroll']/div/span").text)
        cProcess = self.driver.find_element_by_xpath("//div[@id='course-enroll-info']/div/div[1]/div[2]/div[1]/span[2]").text
        cBrief = self.driver.find_element_by_xpath("//div[@id='j-rectxt2']").text
        return cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief

    def _crawlCurrentPage(self):
        """Open every course card of the current result page and store it.

        A course that cannot be read is reported and skipped; the driver is
        always returned to the result window afterwards.
        """
        mainWindow = self.driver.current_window_handle
        divs = self.driver.find_elements_by_xpath("//div[@data-action='课程点击']")
        for div in divs:
            time.sleep(3)
            record = None
            try:
                div.click()
                # Switch to the most recently opened window.
                self.driver.switch_to.window(self.driver.window_handles[-1])
                print(self.driver.current_url)
                time.sleep(3)
                record = self._readCourse()
            except Exception as err:
                print(err)
            finally:
                try:
                    # Close only a window the click opened, never the result list.
                    if self.driver.current_window_handle != mainWindow:
                        self.driver.close()
                    self.driver.switch_to.window(mainWindow)
                except Exception as err:
                    print(err)
            if record is not None:
                courseId = self.no
                self.no += 1
                self.insertDB(courseId, *record)

    def getcourse(self, keyword='java', maxPages=3):
        """Search for ``keyword`` and store the courses of the first pages.

        :param keyword: text typed into the site's search box.
        :param maxPages: number of result pages to process.

        The search is submitted once; later pages are reached only through
        the pager so the result list is not reset.
        """
        self.page = 1
        self.driver.find_element_by_xpath("//div[@class='u-baseinputui']/input").send_keys(keyword)
        self.driver.find_element_by_xpath("//div[@class='u-search-icon']").click()
        # Give the result list time to render before reading it.
        time.sleep(3)
        while True:
            self._crawlCurrentPage()
            if self.page >= maxPages:
                break
            # NOTE(review): li[10] is presumably the pager's "next" button -- confirm.
            self.driver.find_element_by_xpath("//div[@id='j-courseCardListBox']/div[2]/ul/li[10]").click()
            self.page += 1
            time.sleep(3)

    def insertDB(self,id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief):
        """Print one course record and insert it into ``moocs``.

        All values are passed as query parameters; an insert error is printed
        and otherwise ignored.
        """
        record = (id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief)
        print(*record)
        cursor = getattr(self, "cursor", None)
        if cursor is None:
            # No usable connection (startUp already reported why).
            return
        try:
            cursor.execute("insert into moocs (id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief) values (%s,%s,%s,%s,%s,%s,%s,%s)",
                           record)
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        """Run a complete crawl starting at ``url``.

        Browser and database are closed (and stored rows committed) even when
        login or crawling raises.
        """
        print("Spider starting......")
        try:
            # Create the browser and connect to the database.
            self.startUp(url)
            print("Spider processing......")
            # Log in, then collect the course data.
            self.signUp()
            self.getcourse()
        finally:
            self.closeUp()

# Entry point of the course crawler: open the site's home page and run one
# full login + search + crawl cycle.
url = "https://www.icourse163.org/#"
spider = getMoocs()
spider.executeSpider(url)

运行结果截图

心得体会

1.对登录操作有了初步的了解，能够操作浏览器对象完成登录，同时知道了登录时需要先切换到登录表单所在的 iframe
2.学习了如何切换页面
3.通过xpath获取各种信息更加熟练，同时也复习了正则表达式的使用

posted @ 2020-11-21 19:34 苏镜泽阅读(93) 评论(0) 收藏举报

刷新页面返回顶部

苏镜泽

第五次作业

作业一

要求

代码

运行结果截图

心得体会

作业二

要求

候选网站

思路

代码

运行结果截图

心得体会

作业三

要求

候选网站

思路

代码

运行结果截图

心得体会

公告