果然我已经把bs4全忘了并且scrapy只记得一半
因为selenium真的方便好玩且功能齐全
所以第一、第二题就只能用很丑陋的代码把它勉强实现出来，害。

作业一

1）、爬取豆瓣top250

代码如下：


from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading
from prettytable import PrettyTable
import re


# Column headers of the result table; every add_row() call on ``x`` has to
# supply its eleven values in exactly this order.
_COLUMNS = [
    "排名",
    '电影名称',
    '导演',
    '主演',
    '上映时间',
    '国家',
    '电影类型',
    '评分',
    '评价人数',
    '引用',
    '文件路径',
]
x = PrettyTable(_COLUMNS)


def _parse_movie(entry):
    """Return the table fields of one film, parsed from its ``<li>`` element.

    The result is the list ``[director, lead, year, nation, genre, rating,
    rating_count, quote]``.  Raises (IndexError / ValueError /
    AttributeError) when the element does not have the expected layout; the
    caller logs that and skips the row.
    """
    credits = entry.find('p', attrs={'class': ''}).text
    # First space-separated word after the first colon / after the second colon.
    direct = credits.split(':', 1)[1].split(' ')[1]
    maind = credits.split(':', 2)[2].split(' ')[1]
    # Everything after the first "..." is treated as "year / nation / genre".
    # The original discarded the result of str.replace(), so the fields kept
    # their non-breaking spaces and newlines; normalise and strip them here.
    facts = credits.split('...', 1)[1].replace('\xa0', ' ')
    year, nation, plot = [part.strip() for part in facts.split('/', 3)[:3]]
    num = entry.find('span', attrs={'class': 'rating_num'}).text
    # NOTE(review): positional lookup kept from the original; it presumably
    # depends on how many <span> elements precede the vote count - confirm.
    num_people = entry.find_all('span')[7].text
    # An entry without a quote used to raise and lose the whole row.
    quote = entry.find('span', attrs={'class': 'inq'})
    go = quote.text if quote is not None else ''
    return [direct, maind, year, nation, plot, num, num_people, go]


def imageSpider(url):
    """Scrape one Top-250 page: fill the table ``x`` and start poster downloads.

    The page at ``url`` is fetched with the module-level ``headers``.  The
    k-th ``<li>`` of the ``grid_view`` list is paired with the k-th ``<img>``
    of the page (the pairing the original relied on).  For every pair a row is
    added to ``x`` and a ``download`` thread is started; started threads are
    appended to the global ``threads`` so the caller can join them.  The
    global ``count`` is reset to 0 and ends up as the number of downloads
    started.  Errors are printed, never raised.
    """
    global threads
    global count
    global name
    from urllib.parse import urljoin
    try:
        count = 0
        urls = []
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            data = resp.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        soup = BeautifulSoup(dammit.unicode_markup, 'html.parser')
        images = soup.select("img")
        entries = soup.find('ol', attrs={'class': "grid_view"}).find_all('li')
        # zip() stops at the shorter sequence, so surplus images can no longer
        # index past the film list and be saved under a stale title.
        for rank, (entry, image) in enumerate(zip(entries, images), start=1):
            title = None
            try:
                title = entry.find('span', attrs={'class': 'title'}).text
                name = title  # still published globally for existing readers
                x.add_row([rank, title] + _parse_movie(entry) + [title + '.jpg'])
            except Exception as err:
                print(err)
                print('err4')
            if title is None:
                # Without a title there is no file name to save the poster under.
                continue

            try:
                # Resolve against the page URL; the original overwrote ``url``
                # with each image URL and so changed the base for later joins.
                poster = urljoin(url, image["src"])
                if poster not in urls:
                    urls.append(poster)  # original never recorded seen URLs
                    count = count + 1
                    # The title travels with the thread instead of being read
                    # from the shared global when the thread happens to run.
                    T = threading.Thread(target=download,
                                         args=(poster, count, title),
                                         daemon=False)
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
                print('err')
    except Exception as err:
        print(err)
        print('err1')


def download(url, count, filename=None):
    """Download ``url`` and save it as ``<title>.jpg`` in the poster folder.

    ``count`` is only used in the progress message.  ``filename`` is the film
    title to save under; when omitted the module-level ``name`` is used, which
    keeps the old two-argument call working.  URLs ending in ``.png`` are
    skipped, as before.  Errors are printed, never raised.
    """
    try:
        # Only a dot exactly four characters from the end counts as an extension.
        ext = url[-4:] if len(url) >= 4 and url[-4] == "." else ""
        if ext == '.png':
            # The original fetched PNGs and then discarded them; skip the request.
            return
        title = filename if filename is not None else name
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=100) as resp:
            data = resp.read()
        # Same path as before, written with valid escape sequences.
        with open("..\\文件夹\\豆瓣\\" + title + '.jpg', "wb") as fobj:
            fobj.write(data)
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)
        print('err2')

def nextpage():
    """Scrape the page starting at offset ``count`` when ``count`` is a multiple of 25.

    Reads the module-level ``count`` and calls ``imageSpider`` with the
    matching ``start=`` URL.  Errors are printed, never raised.
    """
    try:
        if count % 25 == 0:
            # ``count`` is an int: concatenating it directly raised TypeError
            # on every call, which the bare ``except`` then hid.
            url = 'https://movie.douban.com/top250?start=' + str(count) + '&filter='
            imageSpider(url)
    except Exception as err:
        print(err)
        print('err3')


# Script entry: scrape the first Top-250 page, wait until every poster
# download thread has finished, then print the collected table.
headers = {
    'User-Agent': (
        ' Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3775.400 QQBrowser/10.6.4209.400'
    )
}
url = 'https://movie.douban.com/top250'
threads = []
count = 0

imageSpider(url)

# imageSpider() appended every started download thread to ``threads``.
for worker in threads:
    worker.join()
print('the end')

print(x)

prettytable：

图片：

2）、心得体会

看了看自己前面的代码，在大框架有的情况下倒是不难
就是中间部分的导演、主演全部写在了一起
所以只能用了我自己都看不下去的丑陋的re表达式把那一大段的句子一点一点地从各种符号中拆分出来
prettytable对的不是很齐，不过不太在意，想要对齐存数据库就好了。

作业二

1）、爬取大学信息

随着接触的框架越来越多（django，scrapy）之类的，一个前人搭建好的框架真的可以节省很多时间。
因为分类太多，加上之前也做过，这里只放上主函数的代码
代码如下：


import scrapy
from ..items import ruankeItem
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver import Firefox
import time
import urllib.request

# Browser-like request header.  None of the spider code visible in this
# script passes it to a request.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3776.400 QQBrowser/10.6.4212.400'
    ),
}


class MySpider(scrapy.Spider):
    """Scrapy spider that walks the 2020 ranking table with a Firefox driver.

    Every table row is clicked in turn; from the page that opens, one
    ``ruankeItem`` is yielded and the university logo is written to disk,
    then the browser navigates back to the table.
    """

    name = 'ruanke'
    # Running number of the university being processed; used as ``sNo`` and
    # as the logo file name.
    count = 0

    def start_requests(self):
        """Request the ranking page; the response is handed to ``parse``."""
        url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item per table row, using Selenium for all page access.

        ``response`` itself is not read: the page is loaded again in Firefox
        because the detail pages are reached by clicking.  Failures are
        printed, never raised, and the browser is always shut down at the end.
        """
        # find_element(By.XPATH, ...) works on old and current Selenium; the
        # find_element_by_xpath helpers no longer exist in current releases.
        from selenium.webdriver.common.by import By

        fire_option = Options()
        self.driver = Firefox(options=fire_option)
        try:
            self.driver.get("https://www.shanghairanking.cn/rankings/bcur/2020")
            time.sleep(2)
            self.driver.maximize_window()
            row_total = len(self.driver.find_elements(
                By.XPATH, "//table[@class='rk-table']//tbody//tr"))
            for position in range(1, row_total + 1):
                self.driver.find_element(
                    By.XPATH,
                    "//table[@class='rk-table']//tbody//tr[position()="
                    + str(position) + "]//a[@class='']").click()
                self.count += 1
                time.sleep(2)
                print(self.driver.current_url)
                time.sleep(2)
                # One name for both the record and the file: the original
                # stored "<n>.jpg" in the item but wrote "<n>.png" to disk.
                logo_name = str(self.count) + '.png'
                try:
                    item = ruankeItem()
                    item['sNo'] = str(self.count)
                    item['school'] = str(self.driver.find_element(
                        By.XPATH, '//div[@class="univ-name"]').text)
                    rank_text = self.driver.find_elements(
                        By.XPATH, '//div[@class="science-rank-text"]')[1].text
                    # The last three characters are dropped - presumably a
                    # fixed suffix after the city name; TODO confirm.
                    item['city'] = str(rank_text[:-3])
                    item['official'] = str(self.driver.find_element(
                        By.XPATH, '//div[@class="univ-website"]/a').text)
                    item['info'] = str(self.driver.find_element(
                        By.XPATH, "//div[@class='univ-introduce']/p").text)
                    item['mFile'] = logo_name
                    yield item
                except Exception as err:
                    print(err)
                    print('err10')
                try:
                    image = self.driver.find_element(
                        By.XPATH, "//td[@class='univ-logo']//img").get_attribute('src')
                    req = urllib.request.Request(image)
                    with urllib.request.urlopen(req) as resp:
                        data = resp.read()
                    print(image)
                    with open(r"G:\TECENT(3)\tech study\untitled\爬虫\文件夹\example(软科)\img\\" + logo_name, "wb") as fobj:
                        fobj.write(data)
                except Exception as err:
                    print(err)

                time.sleep(1)
                self.driver.back()
                time.sleep(1)
        except Exception as er:
            print(er)
            print('err11')
        finally:
            # The original never released the browser; quit() ends the session
            # whether the crawl finished, failed, or the generator was closed.
            self.driver.quit()

数据库：
图片：

2）、心得体会

在scrapy框架下混杂使用了selenium，因为涉及到了页面跳转，而我不太找得出每个url中的关联，所以选择自动化点击。
pipeline中的函数process_item在复写的时候仅仅只是改了个名字成process_spider就害我找了半个小时，框架好用的前提是你理解框架并且不会一不小心破坏它。
这个页面是点击跳转之后之前的界面就消失了，所以不需要window转换,只要简单的back()回去就可以了
剩下问题不大

作业三

1）、selenium自动化登录以及爬取mooc

基本等同于上次做过的作业，就是登录之后需要点进个人中心，然后有两次的页面跳转，每爬完一个界面需要 self.driver.close() 两次
代码如下：


import datetime
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
import sqlite3
import time


class MySpider:
    """Log in to the MOOC site with Selenium and store the user's courses in SQLite.

    ``executeSpider`` runs the whole crawl (start browser and database, log
    in, scrape, shut down); ``showDB`` prints what is stored in the ``mooc``
    table of ``selenium.db``.
    """

    # Not read by any method of this class.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    def startUp(self, url):
        """Open Firefox on ``url`` and recreate an empty ``mooc`` table."""
        firefox_options = Options()
        self.driver = Firefox(options=firefox_options)
        time.sleep(1)
        self.driver.get(url)
        time.sleep(1)
        self.No = 0
        try:
            self.con = sqlite3.connect("selenium.db")
            self.cursor = self.con.cursor()
            # "if exists" replaces the try/except-pass around a plain drop.
            self.cursor.execute("drop table if exists mooc")
            sql = "create table mooc (id varchar(256),cCource varchar(256),cCollege varchar(256),cTeacher varchar(256),cTeam varchar(256),cCount varchar(256),cProcess varchar(256),cBrief varchar(256))"
            self.cursor.execute(sql)
        except sqlite3.Error as err:
            print(err)
            print("err0")

    def closeUp(self):
        """Commit and close the database, then shut the browser down.

        The two steps are independent, so a database failure no longer leaves
        the browser running.
        """
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print("err1")
            print(err)
        try:
            # quit() ends the whole session; close() only closed one window.
            self.driver.quit()
        except Exception as err:
            print("err1")
            print(err)

    def insertDB(self, id, cCource, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
        """Insert one course row; a failure is printed, not raised."""
        try:
            sql = "insert into mooc (id,cCource,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief) values(?,?,?,?,?,?,?,?)"
            self.cursor.execute(sql, (
                id, cCource, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
        except Exception as err:
            print("err3")
            print(err)

    def showDB(self):
        """Print all rows of the ``mooc`` table, ordered by id."""
        # One format for header and rows; the original header had no spaces
        # between columns and so did not line up with the data.
        row_format = "%-16s %-16s %-16s %-16s %-16s %-16s %-16s %-16s"
        try:
            con = sqlite3.connect("selenium.db")
            try:
                cursor = con.cursor()
                print(row_format % (
                    "id", "cCource", "cCollege", "cTeacher", "cTeam", "cCount", "cProcess", "cBrief"))
                cursor.execute("select id, cCource, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief from mooc order by id")
                for row in cursor.fetchall():
                    print(row_format % row)
            finally:
                # Closed on the error path too.
                con.close()
        except Exception as err:
            print(err)
            print("err4")

    def load_mooc(self, phone='13023875560', password='8585asd369'):
        """Log in through the phone-number form inside the login iframe.

        ``phone`` and ``password`` default to the values that used to be typed
        in directly, so existing calls without arguments behave as before.
        """
        # NOTE(review): real credentials should not live in source; pass them
        # in from the caller and remove these defaults.
        from selenium.webdriver.common.by import By

        self.driver.maximize_window()
        self.driver.find_element(By.XPATH, "//a[@class='f-f0 navLoginBtn']").click()
        time.sleep(1)
        self.driver.find_element(
            By.XPATH, "//span[@class='ux-login-set-scan-code_ft_back']").click()
        time.sleep(1)
        self.driver.find_elements(
            By.XPATH, "//ul[@class='ux-tabs-underline_hd']//li")[1].click()
        time.sleep(1)
        # The form fields are looked up after switching into the second
        # iframe of the page.
        iframe_id = self.driver.find_elements(By.TAG_NAME, "iframe")[1].get_attribute('id')
        self.driver.switch_to.frame(iframe_id)
        self.driver.find_element(By.XPATH, "//input[@id='phoneipt']").send_keys(phone)
        time.sleep(1)
        self.driver.find_element(
            By.XPATH, "//input[@class='j-inputtext dlemail']").send_keys(password)
        time.sleep(1)
        self.driver.find_element(
            By.XPATH, "//a[@class='u-loginbtn btncolor tabfocus ']").click()
        time.sleep(5)
        self.driver.get(self.driver.current_url)

    def _scrape_course(self):
        """Read the fields of the course page in the current window.

        Returns ``(cCource, cCollege, cTeacher, team, cCount, cProcess,
        cBrief)`` and prints each value as it is read.  Raises whatever
        Selenium raises when an element is missing.
        """
        from selenium.webdriver.common.by import By

        cCource = self.driver.find_element(
            By.XPATH, "//span[@class='course-title f-ib f-vam']").text
        print(cCource)
        cCollege = self.driver.find_element(
            By.XPATH, "//img[@class='u-img']").get_attribute("alt")
        print(cCollege)
        cTeacher = self.driver.find_element(
            By.XPATH,
            "//div[@class='um-list-slider_con']/div[position()=1]//h3[@class='f-fc3']").text
        print(cTeacher)
        # One query for all names instead of re-running it once per index
        # until an IndexError ended the loop.
        members = self.driver.find_elements(
            By.XPATH, "//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']")
        ans = ",".join(member.text for member in members)
        print(ans)
        cCount = self.driver.find_element(
            By.XPATH,
            "//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
        print(cCount)
        cProcess = self.driver.find_element(
            By.XPATH,
            "//div[@class='course-enroll-info_course-info_term-info_term-time']//span[position()=2]").text
        print(cProcess)
        cBrief = self.driver.find_element(By.XPATH, "//div[@id='j-rectxt2']").text
        print(cBrief)
        return cCource, cCollege, cTeacher, ans, cCount, cProcess, cBrief

    def _close_extra_windows(self, main_window):
        """Close every window except ``main_window`` and switch back to it."""
        for handle in self.driver.window_handles:
            if handle != main_window:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(main_window)

    def processSpider(self):
        """Open each course card, scrape its course page and store one row.

        A course whose page cannot be scraped is reported and skipped; the
        original either stopped the whole run on it (first course) or stored
        the previous course's values again.
        """
        from selenium.webdriver.common.by import By

        try:
            time.sleep(1)
            print(self.driver.current_url)
            print('0')
            self.driver.find_element(
                By.XPATH, "//div[@class='u-navLogin-myCourse-t']").click()
            print('1')
            time.sleep(2)
            self.driver.find_element(
                By.XPATH, "//div[@class='f-fl f-f0']//a[position()=4]").click()
            time.sleep(2)
            # The window that holds the course cards; every iteration has to
            # come back to it before the next card can be clicked.
            main_window = self.driver.current_window_handle
            cards = self.driver.find_elements(
                By.XPATH, "//div[@class='course-card-wrapper']")
            for card in cards:
                record = None
                try:
                    card.click()
                    time.sleep(2)
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    self.driver.find_element(
                        By.XPATH, "//h4[@class='f-fc3 courseTxt']").click()
                    self.driver.switch_to.window(self.driver.window_handles[-1])
                    print(self.driver.current_url)
                    time.sleep(2)
                    record = self._scrape_course()
                except Exception as err:
                    print(err)
                finally:
                    # Runs on success and failure alike, so a failed scrape no
                    # longer leaves one of the two opened windows behind.
                    self._close_extra_windows(main_window)
                if record is None:
                    continue
                self.No = self.No + 1
                no = str(self.No).zfill(3)
                print(no)
                self.insertDB(no, *record)
        except Exception as err:
            print(err)
            print("err10")

    def executeSpider(self, url):
        """Run the complete crawl against ``url`` and print the elapsed time."""
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        try:
            print("Spider processing......")
            self.load_mooc()
            print("loading closing......")
            self.processSpider()
        finally:
            # load_mooc() does not catch its own errors; release the browser
            # and the database even when the login step raises.
            print("Spider closing......")
            self.closeUp()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


# Console menu: "1" runs the crawl, "2" prints the stored rows, "3" leaves;
# any other input shows the menu again.
spider = MySpider()
url = "https://www.icourse163.org/channel/2001.htm"
choice = ""
while choice != "3":
    for entry in ("1.爬取", "2.显示", "3.退出"):
        print(entry)
    choice = input("请选择(1,2,3):")
    if choice == "1":
        spider.executeSpider(url)
    elif choice == "2":
        spider.showDB()

数据库：

确实只有这六门课
gif 动态图：

随便找了个格式转换，不过应该能勉强看清

2）、心得体会

其实和上次基本相同，登录的时候需要转换iframe。
就是需要点进个人中心，再点进你学完的课程的主页面。
这次会跳转出两个窗口，所以需要在爬完页面后关掉两个窗口，并且回到主页面。

posted on 2020-11-25 16:21 durrican 阅读(203) 评论(0) 收藏举报