第五次作业

作业①:

要求:

候选网站:http://www.jd.com/

关键词:学生自由选择

1)爬取京东实验:

2.编写代码:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
class MySpider:
    """Crawl JD.com search results with headless Chrome.

    Each product found on a result page is written to the ``computer`` table
    of the SQLite file ``computer.db`` and its picture is downloaded into
    ``imagePath`` by a background thread.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.361"}
    imagePath = "download"
    # SQLite file shared by startUp (writer) and showDB (reader).
    dbPath = "computer.db"

    def startUp(self, url, key):
        """Open the browser, reset storage and submit the search.

        Args:
            url: Address of the page that contains the search box.
            key: Keyword typed into the search box.
        """
        # Initializing Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)
        # Initializing variables
        self.threads = []
        self.No = 0
        self.imgNo = 0
        # Initializing database: start every crawl from an empty table.
        try:
            self.con = sqlite3.connect(MySpider.dbPath)
            self.cursor = self.con.cursor()
            self.cursor.execute("drop table if exists computer")
            sql = "create  table  computer  (mNo  varchar(32) primary key, mMark varchar(256),mPrice varchar(32),mNote varchar(1024),mFile varchar(256))"
            self.cursor.execute(sql)
        except Exception as err:
            print(err)
        # Initializing images folder: create it if needed, then empty it.
        try:
            os.makedirs(MySpider.imagePath, exist_ok=True)
            for img in os.listdir(MySpider.imagePath):
                os.remove(os.path.join(MySpider.imagePath, img))
        except Exception as err:
            print(err)
        self.driver.get(url)
        keyInput = self.driver.find_element_by_id("key")
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        """Commit and close the database, then shut the browser down."""
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print(err)
        # Handled separately so a database error cannot leave the browser open.
        try:
            self.driver.quit()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        """Insert one product row; errors are printed, not raised.

        Args:
            mNo: Zero-padded sequence number (primary key).
            mMark: Brand text, the first word of the product title.
            mPrice: Price text as shown on the page.
            mNote: Product title.
            mFile: Name of the downloaded image file, or "" when there is none.
        """
        try:
            sql = "insert into computer (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
        except Exception as err:
            print(err)

    def showDB(self):
        """Print every stored product, ordered by sequence number."""
        rowFormat = "%-8s %-16s %-8s %-16s %s"
        try:
            con = sqlite3.connect(MySpider.dbPath)
            try:
                cursor = con.cursor()
                print(rowFormat % ("No", "Mark", "Price", "Image", "Note"))
                cursor.execute("select mNo,mMark,mPrice,mFile,mNote from computer  order by mNo")
                for row in cursor.fetchall():
                    print(rowFormat % (row[0], row[1], row[2], row[3], row[4]))
            finally:
                con.close()
        except Exception as err:
            print(err)

    def download(self, src1, src2, mFile):
        """Fetch an image from src1, falling back to src2, and save it.

        Args:
            src1: Preferred image URL; may be empty.
            src2: Fallback image URL; may be empty.
            mFile: File name to write inside ``imagePath``.
        """
        data = None
        for src in (src1, src2):
            if not src:
                continue
            try:
                req = urllib.request.Request(src, headers=MySpider.headers)
                with urllib.request.urlopen(req, timeout=10) as resp:
                    data = resp.read()
            except Exception as err:
                # Best effort: report and try the next candidate URL.
                print("download failed", src, err)
            if data:
                break
        if data:
            print("download begin", mFile)
            with open(os.path.join(MySpider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("download finish", mFile)

    def _processItem(self, li):
        """Extract one product from its list element, queue its image and store it."""
        # We find that the image is either in src or in data-lazy-img attribute
        try:
            img = li.find_element_by_xpath(".//div[@class='p-img']//a//img")
            src1 = img.get_attribute("src") or ""
            src2 = img.get_attribute("data-lazy-img") or ""
        except Exception:
            src1 = src2 = ""
        try:
            price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
        except Exception:
            price = "0"
        try:
            note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
            mark = note.split(" ")[0].replace("爱心东东\n", "").replace(",", "")
            note = note.replace("爱心东东\n", "").replace(",", "")
        except Exception:
            note = ""
            mark = ""

        self.No = self.No + 1
        no = str(self.No).zfill(6)
        print(no, mark, price)

        # Make both candidates absolute so the fallback URL is usable too.
        if src1:
            src1 = urllib.request.urljoin(self.driver.current_url, src1)
        if src2:
            src2 = urllib.request.urljoin(self.driver.current_url, src2)
        src = src1 or src2
        if src:
            # The image keeps the extension of the URL it is named after.
            mFile = no + src[src.rfind("."):]
            T = threading.Thread(target=self.download, args=(src1, src2, mFile), daemon=False)
            T.start()
            self.threads.append(T)
        else:
            mFile = ""
        self.insertDB(no, mark, price, note, mFile)

    def processSpider(self):
        """Walk through every result page and process each listed product."""
        try:
            while True:
                time.sleep(1)
                print(self.driver.current_url)
                lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
                for li in lis:
                    self._processItem(li)
                # Fetch the next page until the "next" button is disabled.
                if self.driver.find_elements_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']"):
                    break
                nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                time.sleep(10)
                nextPage.click()
        except Exception as err:
            print(err)

    def executeSpider(self, url, key):
        """Run a complete crawl and report the elapsed time.

        Args:
            url: Address of the page that contains the search box.
            key: Search keyword.
        """
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        # Image downloads run in their own threads; wait for all of them.
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = int((endtime - starttime).total_seconds())
        print("Total ", elapsed, " seconds elapsed")
# Interactive menu: crawl JD for laptops, show the stored rows, or quit.
url = "http://www.jd.com"
spider = MySpider()
choice = None
while choice != "3":
    for entry in ("1.爬取", "2.显示", "3.退出"):
        print(entry)
    choice = input("请选择(1,2,3):")
    if choice == "1":
        spider.executeSpider(url, "笔记本")
    elif choice == "2":
        spider.showDB()

爬取结果:


不知道为什么会混入书本的图片

2)心得体会

这主要是对PPT中代码的复现,为后面的两个作业奠定了框架。

作业②

  • 要求:

    • 熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
    • 使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
  • 候选网站:东方财富网:http://quote.eastmoney.com/center/gridlist.html#hs_a_board

    1)爬取股票信息实验

    代码部分:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import pymysql
    import datetime
    import time
    
    
    class MySpider:
        """Crawl a stock board table with headless Chrome and store rows in MySQL."""

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.361"
        }
        # Running row number; it keeps increasing across boards.
        count = 1
        # Number of result pages scraped per board.
        maxPages = 3

        def startUp(self, url):
            """Open the browser and the MySQL connection, then load ``url``.

            Args:
                url: Address of the board page to crawl.
            """
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=chrome_options)
            # Initializing variables
            self.threads = []
            self.No = 0
            self.imgNo = 0
            # Set outside the try block so processSpider can rely on it even
            # when the database connection fails.
            self.page_num = 1
            try:
                self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb",
                                           charset="utf8")
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                self.opened = True
            except Exception as err:
                print(err)
                self.opened = False
            self.driver.get(url)

        def closeUp(self):
            """Commit and close the database if it was opened, then quit the browser."""
            try:
                if self.opened:
                    self.con.commit()
                    self.con.close()
                    self.opened = False
            except Exception as err:
                print("关闭数据库失败", err)
            # Handled separately so a database error cannot leave the browser open.
            try:
                self.driver.quit()
                print("爬取完毕,closed")
            except Exception as err:
                print(err)

        def insertDB(self, Position, id, No, name, price, edu, fudu, cjl, cje, zf, highest,
                     lowest, today, yestoday):
            """Insert one stock row into ``my_stock``; errors are printed, not raised.

            The arguments are stored, in order, in the columns of the same name.
            """
            try:
                self.cursor.execute(
                    "insert into my_stock(Position,id,No,name,price,edu,fudu,cjl,cje,zf,highest,lowest,today,yestoday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (Position, id, No, name, price, edu, fudu, cjl, cje, zf, highest,
                     lowest, today, yestoday))
            except Exception as err:
                print("插入失败", err)

        def processSpider(self):
            """Scrape up to ``maxPages`` pages of the currently loaded board."""
            # Table cells that are stored; column 4 is not read.
            columns = (1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
            try:
                while True:
                    time.sleep(1)
                    print(self.driver.current_url)
                    tr_list = self.driver.find_elements_by_xpath("//*[@id='table_wrapper-table']/tbody/tr")
                    for tr in tr_list:
                        cells = [tr.find_element_by_xpath('./td[%d]' % n).text for n in columns]
                        self.insertDB(str(self.count), *cells)
                        self.count += 1
                    if self.page_num >= MySpider.maxPages:
                        break
                    # A disabled "next" button marks the last page.
                    if self.driver.find_elements_by_xpath(
                            '//*[@id="main-table_paginate"]/a[@class="next paginate_button disabled"]'):
                        break
                    next_page = self.driver.find_element_by_xpath(
                        '//*[@id="main-table_paginate"]/a[@class="next paginate_button"]')
                    time.sleep(3)
                    next_page.click()
                    self.page_num += 1
            except Exception as err:
                print(err)
    
    
    # Crawl the three A-share boards one after another with a single spider,
    # so the row counter keeps running across boards.
    myspider = MySpider()
    # Board label -> page (with hash fragment) under the quote centre.
    data = {
        "沪深A股": "gridlist.html#hs_a_board",
        "上证A股": "gridlist.html#sh_a_board",
        "深证A股": "gridlist.html#sz_a_board"
    }
    starttime = datetime.datetime.now()
    for key, page in data.items():
        print("正在爬取" + key + "板块的股票")
        url = "http://quote.eastmoney.com/center/" + page
        print("开始爬虫...")
        myspider.startUp(url)
        print("正在爬虫...")
        myspider.processSpider()
        myspider.closeUp()
    endtime = datetime.datetime.now()
    total_time = endtime - starttime
    # Report real seconds; str(timedelta) would print "H:MM:SS.ffffff".
    print("结束爬虫," + "一共耗时" + str(total_time.total_seconds()) + "秒")
    # count starts at 1, so the number of stored rows is count - 1.
    print("一共爬取" + str(myspider.count - 1) + "条数据")
    

爬取结果:

2)心得体会:

这个作业和作业①类似,把以前爬取股票的相关内容套用到作业①的框架中就可以了。但是设计数据库时如果设置主键,爬取过程中会出现主键冲突,所以就没有设置主键。

作业③

1)爬取MOOC实验

代码部分:

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep
import pymysql
import os

# Log in to icourse163, search for a keyword and store course details in MySQL.
#
# The login account and password are read from the environment variables
# MOOC_ACCOUNT and MOOC_PASSWORD so that no credentials live in the source;
# a missing variable raises KeyError before the browser is started.
account = os.environ["MOOC_ACCOUNT"]
password = os.environ["MOOC_PASSWORD"]

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)

driver.get("https://www.icourse163.org/")
driver.maximize_window()
sleep(2)

driver.find_element_by_xpath('//div[@class="unlogin"]//a[@class="f-f0 navLoginBtn"]').click()   # log in / register
sleep(2)
driver.find_element_by_class_name('ux-login-set-scan-code_ft_back').click()              # other login methods
sleep(2)
driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']//li[@class='']").click()
sleep(2)
driver.switch_to.frame(driver.find_element_by_xpath("//div[@class='ux-login-set-container']//iframe"))
driver.find_element_by_xpath('//input[@id="phoneipt"]').send_keys(account)        # account
sleep(2)
driver.find_element_by_xpath('//input[@placeholder="请输入密码"]').send_keys(password)   # password
sleep(2)
driver.find_element_by_xpath('//div[@class="f-cb loginbox"]//a[@id="submitBtn"]').click()  # submit login
sleep(3)
# Leave the login iframe before touching elements of the main page.
driver.switch_to.default_content()
driver.find_element_by_xpath(
    '//div[@class="u-baseinputui"]/input[@class="j-textarea inputtxt"]').send_keys("医学")    # course keyword
sleep(2)
driver.find_element_by_xpath('//div[@class="u-search-icon"]/span[@class="u-icon-search2 j-searchBtn"]').click()   # search
sleep(2)

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456', db='mydb', charset='utf8')
cursor = conn.cursor()
course_id = 0
course_xpath = '//div[@class="m-course-list"]/div/div'
try:
    while True:
        sleep(2)
        # Remember the result-list window so we can always come back to it.
        list_window = driver.current_window_handle
        for i in range(2):
            try:
                # Re-query on every pass: earlier element references go stale.
                div = driver.find_elements_by_xpath(course_xpath)[i + 1]
                div.click()
                sleep(3)
                driver.switch_to.window(driver.window_handles[-1])   # newest window = course page
                sleep(2)
                course_id += 1
                course = driver.find_element_by_xpath('//span[@class="course-title f-ib f-vam"]').text
                process = driver.find_element_by_xpath(
                    '//div[@class="course-enroll-info_course-info_term-info_term-time"]/span[2]').text
                college = driver.find_element_by_xpath('//*[@id="j-teacher"]/div/a/img').get_attribute("alt")
                count = driver.find_element_by_xpath(
                    '//span[@class="course-enroll-info_course-enroll_price-enroll_enroll-count"]').text
                brief = driver.find_element_by_xpath('//*[@id="j-rectxt2"]').text
                teacher = driver.find_element_by_xpath('//div[@class="cnt f-fl"][1]/h3').text
                teas = driver.find_elements_by_xpath('//div[@class="um-list-slider_con"]/div')
                if len(teas) > 1:
                    # Concatenate the teacher names of the whole team.
                    team = "".join(
                        tea.find_element_by_xpath('.//div[@class="cnt f-fl"]/h3').text for tea in teas)
                else:
                    team = teacher
                print(course, college, teacher, team, process, brief)
                try:
                    # Parameterized so quotes inside the scraped text cannot break the statement.
                    cursor.execute('insert into my_mooc values(%s,%s,%s,%s,%s,%s,%s,%s)',
                                   (course_id, course, college, teacher, team, count, process, brief))
                    conn.commit()
                except Exception as err:
                    print(err)
                    conn.rollback()
            except Exception as e:
                print(e)
            finally:
                # Whatever happened, close any course tab and return to the
                # result list so the next course starts from a known state.
                for handle in driver.window_handles:
                    if handle != list_window:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(list_window)
                sleep(2)
        # Turn the page once per result page; stop at the last one.
        if driver.find_elements_by_xpath(
                '//li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-disable-gh"]'):
            break
        try:
            driver.find_element_by_xpath(
                '//li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]').click()
        except Exception as e:
            # No usable "next" button: treat it as the end of the results.
            print(e)
            break
        sleep(3)
finally:
    cursor.close()
    conn.close()
    driver.quit()

爬取结果:

爬到一半就报错了

页面卡在了教师介绍页面就不动了,暂时不知道咋解决

posted @ 2020-11-21 23:00  Incwu  阅读(100)  评论(0编辑  收藏  举报