Data Acquisition Technology: Assignment 5

Assignment 1

1) Requirements:

Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded page data, waiting for HTML elements, and so on.
Use the Selenium framework to scrape information and images for one category of products on JD.com.
Candidate site: http://www.jd.com/
Code

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time

class MySpider:
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    imagePath = "download"

    def startUp(self, url, key):
        # Initialize the Chrome browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)

        # Initializing variables
        self.threads = []
        self.No = 0
        self.imgNo = 0
        self.page_num = 1   # initialize the page counter
        # Initializing database
        try:
            self.con = sqlite3.connect("phones.db")
            self.cursor = self.con.cursor()
            try:
                # drop the table if it already exists
                self.cursor.execute("drop table phones")
            except:
                pass
            try:
                # create a fresh table
                sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256))"
                self.cursor.execute(sql)
            except:
                pass

        except Exception as err:
            print(err)
            # Initializing images folder
        try:
            if not os.path.exists(MySpider.imagePath):
                os.mkdir(MySpider.imagePath)
            images = os.listdir(MySpider.imagePath)
            for img in images:
                s = os.path.join(MySpider.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER) 

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        try:
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print(err) 

    def showDB(self):
        try:
            con = sqlite3.connect("phones.db")
            cursor = con.cursor()
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")
            rows = cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))
            con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        data = None
        if src1:
            try:
                req = urllib.request.Request(src1, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if not data and src2:
            try:
                req = urllib.request.Request(src2, headers=MySpider.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(os.path.join(MySpider.imagePath, mFile), "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)  

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            for li in lis:
                # We find that the image is either in src or in data-lazy-img attribute
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""

                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")

                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no = str(self.No)
                while len(no) < 6:
                    no = "0" + no
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.setDaemon(False)
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
            try:
                self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            except:
                nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                # limit the crawl to 5 pages
                if(self.page_num<=4):
                    self.page_num +=1
                    nextPage.click()
                    self.processSpider()
        except Exception as err:
            print(err)       

    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")

url = "http://www.jd.com"
spider = MySpider()
while True:
    print("1. Scrape")
    print("2. Show database")
    print("3. Exit")
    s = input("Choose (1, 2, 3): ")
    if s == "1":
        spider.executeSpider(url, "手机")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break

Results

2) Reflections

The first task was mainly a reproduction of the teacher's code, and it took quite a while before I roughly understood it (the teacher really is the teacher). With that as a foundation I had a rough idea of how to implement the other two tasks, and working through this one made me much more familiar with how to use Selenium.
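
The requirements also mention waiting for HTML elements, while the code above only uses fixed time.sleep() calls. Below is a minimal sketch of an explicit wait that could replace one of those sleeps, assuming self.driver is the Chrome driver created in startUp and reusing the J_goodsList container that processSpider already queries:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the goods list to appear instead of sleeping a fixed amount of time
WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.ID, "J_goodsList")))

The wait returns as soon as the element shows up, so slow pages no longer break the scrape and fast pages are not held back by an arbitrary sleep.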

Assignment 2

1) Requirements:

Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded page data, waiting for HTML elements, and so on.
Use the Selenium framework together with MySQL storage to scrape stock data from the three boards 沪深A股, 上证A股 and 深证A股.
Candidate site: Eastmoney: http://quote.eastmoney.com/center/gridlist.html#hs_a_board
Code

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import datetime
import time
from lxml import etree

class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
    }
    count = 1
    def startUp(self,url):
        # Initialize the Chrome browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            self.con = pymysql.connect(host = "127.0.0.1",port = 3306,user = "root",passwd = "********",db = "mydb",charset = "utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from stock")  # clear the table if it already exists
            # self.cursor.execute("create table stock(id int primary key,"
            #                                        "stockNo varchar(20),"
            #                                        "name varchar(20),"
            #                                        "new_price varchar(20),"
            #                                        "up_rate varchar(20),"
            #                                        "up_num varchar(20),"
            #                                        "turnover varchar(20),"
            #                                        "turnover_num varchar(20),"
            #                                        "change_ varchar(20),"
            #                                        "high varchar(20),"
            #                                        "low varchar(20),"
            #                                        "today varchar(20),"
            #                                        "yestoday varchar(20))")
            self.opened = True
            self.page_num = 1
        except Exception as err:
            print(err)
            self.opened = False
        self.driver.get(url)

    def closeUp(self):
        try:
            if(self.opened):
                self.con.commit()
                self.con.close()
                self.opened = False
            self.driver.close()
            print("Scraping finished, connection closed")
        except Exception as err:
            print("Failed to close the database", err)

    def insertDB(self,number,id,stockNo,name,new_price,up_rate,up_num,turnover,turnover_num,change_,high,low,today,yestoday):
        try:
            self.cursor.execute("insert into stock(number,id,stockNo,name,new_price,up_rate,up_num,turnover,turnover_num,change_,high,low,today,yestoday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(self.count),id,stockNo,name,new_price,up_rate,up_num,turnover,turnover_num,change_,high,low,today,yestoday))
        except Exception as err:
            print("Insert failed", err)

    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            tr_list = self.driver.find_elements_by_xpath("//*[@id='table_wrapper-table']/tbody/tr")
            for tr in tr_list:
                id = tr.find_element_by_xpath('./td[1]').text
                stockNo = tr.find_element_by_xpath('./td[2]').text
                name = tr.find_element_by_xpath('./td[3]').text
                new_price = tr.find_element_by_xpath('./td[5]').text
                up_rate  = tr.find_element_by_xpath('./td[6]').text
                up_num  = tr.find_element_by_xpath('./td[7]').text
                turnover  = tr.find_element_by_xpath('./td[8]').text
                turnover_num  = tr.find_element_by_xpath('./td[9]').text
                change_  = tr.find_element_by_xpath('./td[10]').text
                high = tr.find_element_by_xpath('./td[11]').text
                low = tr.find_element_by_xpath('./td[12]').text
                today = tr.find_element_by_xpath('./td[13]').text
                yestoday = tr.find_element_by_xpath('./td[14]').text
                self.insertDB(str(self.count),id,stockNo,name,new_price,up_rate,up_num,turnover,turnover_num,change_,high,low,today,yestoday)
                self.count += 1
            try:
                self.driver.find_element_by_xpath('//*[@id="main-table_paginate"]/a[@class="next paginate_button disabled"]')
            except:
                next_page = self.driver.find_element_by_xpath('//*[@id="main-table_paginate"]/a[@class="next paginate_button"]')
                time.sleep(3)
                next_page.click()
                #time.sleep(5)
                if(self.page_num<3):
                    self.page_num += 1
                    self.processSpider()
        except Exception as err:
            print(err)

myspider = MySpider()
data = {
    "沪深A股":"gridlist.html#hs_a_board",
    "上证A股":"gridlist.html#sh_a_board",
    "深证A股":"gridlist.html#sz_a_board"
}
starttime = datetime.datetime.now()
for key in data.keys():
    url = "http://quote.eastmoney.com/center/"
    print("Scraping the " + key + " board")
    url = url + data[key]
    print("Starting spider...")
    myspider.startUp(url)
    print("Spider running...")
    myspider.processSpider()
    myspider.closeUp()
endtime = datetime.datetime.now()
total_time = endtime - starttime
print("Spider finished, elapsed time: " + str(total_time))
print("Scraped " + str(myspider.count - 1) + " records in total")

Results


2) Reflections

For the second task there was one open question: for some reason my CREATE TABLE statement would not run from the code, so in the end I had to create the table in MySQL by hand. After that things went fairly smoothly; there were just a lot of fields to scrape, and naming all the columns nearly drove me bald. One more pitfall: while testing the scraping I did not add the close function, so the inserted rows were never committed to the database, and it took me quite a while to discover the problem.
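
I never found out why the CREATE TABLE failed from inside the spider, but a sketch like the following runs the statement on its own and prints the full MySQL error. The column list is copied from the commented-out statement in startUp, the extra number column matches the first value insertDB writes, and making number the primary key instead of id is my own assumption (id restarts from 1 on every board, so it cannot stay unique across the three boards):

import pymysql

# connection parameters match the ones used in startUp (password masked as in the post)
con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="********", db="mydb", charset="utf8")
try:
    with con.cursor() as cursor:
        # column list copied from the commented-out DDL in startUp,
        # plus the leading "number" column that insertDB writes first
        cursor.execute("""
            create table if not exists stock(
                number varchar(20) primary key, id varchar(20), stockNo varchar(20),
                name varchar(20), new_price varchar(20), up_rate varchar(20),
                up_num varchar(20), turnover varchar(20), turnover_num varchar(20),
                change_ varchar(20), high varchar(20), low varchar(20),
                today varchar(20), yestoday varchar(20))""")
    con.commit()
except Exception as err:
    print("create table failed:", err)  # the full MySQL error usually explains why
finally:
    con.close()

The commit() call is also the fix for the other pitfall: pymysql does not autocommit, so inserted rows only persist once commit() runs, which is exactly what closeUp does just before closing the connection.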

Assignment 3

1) Requirements:

Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-loaded page data, waiting for HTML elements, and so on.
Use the Selenium framework plus MySQL to scrape course information from the China MOOC site (course id, course name, school, lead teacher, team members, number of participants, course schedule, course description).
Candidate site: China MOOC: https://www.icourse163.org
Code

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import datetime
import time
from selenium.webdriver.common.keys import Keys
from lxml import etree

class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
    }
    count = 1

    def startUp(self,url):
        # Initialize the Chrome browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            self.con = pymysql.connect(host = "127.0.0.1",port = 3306,user = "root",passwd = "********",db = "mydb",charset = "utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from stock")  # clear the table if it already exists
            self.opened = True
            self.page_num = 1
        except Exception as err:
            print("Failed to connect to the database", err)
            self.opened = False
        self.driver.get(url)
        # the search box here could not be located by its id attribute
        # 1. if an element has no distinctive attributes and cannot be located directly, locate its parent element first
        # 2. once the parent is found, go down one level to reach the element itself
        # 3. if the parent's attributes are not distinctive either, keep moving up the ancestors
        time.sleep(2)
        accession = self.driver.find_element_by_xpath('//*[@id="app"]/div/div/div[1]/div[3]/div[3]/div')
        # print(accession)
        # time.sleep(3)
        accession.click()
        time.sleep(3)
        # switch to other login methods
        other = self.driver.find_element_by_xpath('//div[@class="ux-login-set-scan-code_ft"]/span')
        other.click()
        time.sleep(5)
        self.driver.switch_to.frame(self.driver.find_element_by_xpath("//iframe[starts-with(@id, 'x-URS-iframe')]"))
        email = self.driver.find_element_by_xpath('//input[@name="email"]')
        email.send_keys('2105114977@qq.com')
        time.sleep(2)
        pswd = self.driver.find_element_by_xpath('//input[@name="password"]')
        pswd.send_keys('********')
        time.sleep(2)
        go = self.driver.find_element_by_xpath('//div[@class="f-cb loginbox"]/a')
        go.click()
        time.sleep(5)
        search = self.driver.find_element_by_xpath('//div[@class="u-baseinputui"]/input[@type="text"]')  # locate the search input box
        time.sleep(3)
        search.send_keys(key)
        search.send_keys(Keys.ENTER) 
      

    def closeUp(self):
        try:
            if(self.opened):
                self.con.commit()
                self.con.close()
                self.opened = False
            self.driver.close()
            print("Scraping finished, database closed")
        except Exception as err:
            print("Failed to close the database", err)

    def insertDB(self,id,Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief):
        try:
            self.cursor.execute("insert into mooc(id,Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief) values (%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(self.count),Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief))
        except Exception as err:
            print("Insert failed!", err)
    
    def processSpider(self):
        time.sleep(3)
        print(self.driver.current_url)
        div_list = self.driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div[@class="u-clist f-bgw f-cb f-pr j-href ga-click"]')
        #print(len(div_list))
        for div in div_list:
            Mcourse = div.find_element_by_xpath('.//div[@class="t1 f-f0 f-cb first-row"]').text
            Mcollege = div.find_element_by_xpath('.//a[@class="t21 f-fc9"]').text
            Mteacher = div.find_element_by_xpath('.//a[@class="f-fc9"]').text
            Mcount = div.find_element_by_xpath('.//span[@class="hot"]').text
            # open the course detail page by clicking its image
            detail = div.find_element_by_xpath('.//div[@class="u-img f-fl"]')
            detail.click()
            time.sleep(2)  # wait for the detail page to load before scraping the remaining fields
            windows = self.driver.window_handles  # get all current window handles
            self.driver.switch_to.window(windows[1])  # switch to the newly opened detail window
            teams = self.driver.find_elements_by_xpath('//div[@class="um-list-slider_con_item"]//h3[@class="f-fc3"]')
            Mteam = ""
            for team in teams:
                Mteam = Mteam + team.text
                if(team!=teams[-1]):
                    Mteam += " "
            # print(Mteam)  # debug: check that the team was scraped correctly
            Mprocess = self.driver.find_element_by_xpath('//div[@class="course-enroll-info_course-info_term-info"]//div[@class="course-enroll-info_course-info_term-info_term-time"]/span[2]').text
            Mbrief = self.driver.find_element_by_xpath('//div[@class="course-heading-intro_intro"]').text
            #print(Mprocess,Mbrief)
            
            self.driver.close()  # close the detail window
            self.driver.switch_to.window(windows[0])  # switch back to the original list window
            # watching the browser shows that all required fields can now be scraped; the rest is just storing them in the database
           
            #print(self.count,Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief)
            self.insertDB(str(self.count),Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief)
            self.count += 1
        # one page of data is done; only 2 pages are scraped here
        # unlike the previous task, checking for the disabled state first would end up clicking the previous-page button on page 1, so the next button is tried first
        try:
            next_button = self.driver.find_element_by_xpath('//li[@class="ux-pager_btn ux-pager_btn__next"]//a[@class="th-bk-main-gh"]')
            #print(next_button)
            next_button.click()
            time.sleep(2)
            if(self.page_num<2):
                self.page_num += 1 
                self.processSpider()
        except:
            self.driver.find_element_by_xpath('//li[@class="ux-pager_btn ux-pager_btn__next"]//a[@class="th-bk-disable-gh"]')  # check whether the current page is the last one
            

key = "机器学习"
url = "https://www.icourse163.org"
myspider = MySpider()
start = datetime.datetime.now()
myspider.startUp(url)
myspider.processSpider()
myspider.closeUp()
end = datetime.datetime.now()
print("Scraped " + str(myspider.count - 1) + " records, elapsed time: " + str(end - start))

Results


2) Reflections

This assignment had quite a few pitfalls. The first was the simulated login: some elements could not be located even though the XPath itself was correct; in that case you can locate the parent node first, and if that still fails, keep moving up. Also, the username and password boxes of the login form are nested inside an iframe, so you have to switch into the iframe before they can be located. The second pitfall was the same one as in the previous task: the database inserts produced no error messages at all, yet nothing was saved, and it turned out I had again forgotten to write the close function and call commit. The third was the course detail page: at first I assumed I could start scraping right after the click, but the driver actually stays on the original window, so the data has to be captured by switching back and forth between windows. After these three exercises I have a much better grasp of both Selenium and database storage.
Reference blog on locating elements in an iframe with XPath
Reference blog on window switching
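
A minimal sketch of the two switching patterns described above, assuming driver is the webdriver.Chrome instance created in startUp and reusing the same XPaths as the code:

# 1. the email / password inputs live inside an iframe: switch into it first, then back out
driver.switch_to.frame(
    driver.find_element_by_xpath("//iframe[starts-with(@id, 'x-URS-iframe')]"))
# ... locate and fill the login inputs here ...
driver.switch_to.default_content()  # return to the top-level document

# 2. clicking a course card opens a new window: switch to it, scrape, close it, switch back
windows = driver.window_handles          # handles of all open windows
driver.switch_to.window(windows[1])      # the newly opened detail page
# ... scrape the detail fields here ...
driver.close()                           # closes only the detail window
driver.switch_to.window(windows[0])      # back to the course list

Without the explicit switch_to.window call the driver keeps operating on the original list window, even though the new tab is the one visible in the browser.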
