91、Beego框架之爬虫项目——2020年08月02日19:57:16

91、Beego框架之爬虫项目

2020年08月02日15:21:32

1、建立数据库

movie.sql

-- movie_info: one row per crawled movie.
-- NOTE(review): the crawler stores several "/"-joined names in
-- movie_main_character / movie_type and a free-form string in movie_on_time;
-- confirm the column widths and the timestamp type against real page data.
CREATE TABLE `movie_info` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `movie_id` int(11) unsigned NOT NULL COMMENT '电影id',
  `movie_name` varchar(100) COMMENT '电影名称',
  `movie_pic` varchar(200) COMMENT '电影图片',
  `movie_director` varchar(50) COMMENT '电影导演',
  `movie_writer` varchar(50) COMMENT '电影编剧',
  `movie_country` varchar(50) COMMENT '电影产地',
  `movie_language` varchar(50) COMMENT '电影语言',
  `movie_main_character` varchar(50) COMMENT '电影主演',
  `movie_type` varchar(50) COMMENT '电影类型',
  -- No ON UPDATE clause: a release date must not change when the row is edited.
  `movie_on_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '电影上映时间',
  `movie_span` varchar(20) COMMENT '电影时长',
  `movie_grade` varchar(5) COMMENT '电影评分',
  `remark` varchar(500) DEFAULT '' COMMENT '备注',
  -- Set once at insert; only _modify_time follows later updates.
  `_create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `_modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
  `_status` tinyint(1) DEFAULT '1',
  PRIMARY KEY (`id`),
  KEY `idx_movie_id` (`movie_id`),
  KEY `idx_create_time` (`_create_time`),
  KEY `idx_modify_time` (`_modify_time`)
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='电影信息表';

image-20200802155039854

2、控制层调用方法——网页源代码写死

// CrawlMovie runs every field extractor against a hard-coded HTML string and
// writes each result to the response, followed by "|".
func (c *CrawlMovieController) CrawlMovie() {
	sMovieHtml := `dasd`

	// Extractors in output order: director, name, main characters, grade,
	// genre, release date, running time.
	extractors := []func(string) string{
		models.GetMovieDirector,
		models.GetMovieName,
		models.GetMovieMainCharacters,
		models.GetMovieGrade,
		models.GetMovieGenre,
		models.GetMovieOnTime,
		models.GetMovieRunningTime,
	}
	for _, extract := range extractors {
		c.Ctx.WriteString(extract(sMovieHtml) + "|")
	}
}

路由层

package routers

import (
	"crawl_movie/controllers"
	"github.com/astaxie/beego"
)

func init() {
    beego.Router("/", &controllers.MainController{})
    beego.Router("/crawl_movie", &controllers.CrawlMovieController{}, "*:CrawlMovie")
}

3、models 获取数据

定义 MovieInfo 结构体

// MovieInfo holds the data extracted for one movie page; it is the model
// registered with the ORM.
type MovieInfo struct {
	Id                   int64  // row id
	Movie_id             int64  // movie id
	Movie_name           string // movie title
	Movie_pic            string // movie picture
	Movie_director       string // director
	Movie_writer         string // screenwriter
	Movie_country        string // country of production
	Movie_language       string // language
	Movie_main_character string // main cast
	Movie_type           string // genre
	Movie_on_time        string // release date
	Movie_span           string // running time
	Movie_grade          string // rating
	_Create_time         string // creation time (unexported field)
}

初始化

import (
	"regexp"
	"strings"

	"github.com/astaxie/beego/orm"
	_ "github.com/go-sql-driver/mysql"
)

// db is the package-wide ORM handle; init assigns it.
var db orm.Ormer

// init configures the ORM: it turns on debug output, registers the "default"
// MySQL database and the MovieInfo model, and creates the package-wide Ormer.
//
// It panics if the database cannot be registered, so the real cause is
// reported at startup rather than on first use of db.
//
// NOTE(review): the DSN, including credentials, is hard-coded here.
func init() {
	// Debug mode prints the generated SQL statements.
	orm.Debug = true

	// The trailing 30 is RegisterDataBase's optional int parameter
	// (presumably the idle-connection limit — confirm against the beego docs).
	err := orm.RegisterDataBase("default", "mysql", "root:123@tcp(127.0.0.1:3306)/test?charset=utf8", 30)
	if err != nil {
		panic(err)
	}

	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}

获取导演字段

// directorPattern captures the text of an anchor tagged rel="v:directedBy".
var directorPattern = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

// GetMovieDirector returns the text of the first rel="v:directedBy" anchor in
// movieHtml, or "" when the page contains none.
func GetMovieDirector(movieHtml string) string {
	match := directorPattern.FindStringSubmatch(movieHtml)
	if match == nil {
		// No director markup: report "" instead of indexing an empty result.
		return ""
	}
	return match[1]
}
// namePattern captures the text of the span tagged property="v:itemreviewed".
var namePattern = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

// GetMovieName returns the text of the first property="v:itemreviewed" span
// in movieHtml, or "" when the page contains none.
func GetMovieName(movieHtml string) string {
	match := namePattern.FindStringSubmatch(movieHtml)
	if match == nil {
		// Not a movie page: report "" instead of indexing an empty result.
		return ""
	}
	return match[1]
}
// gradePattern captures the text of the <strong> tagged property="v:average".
var gradePattern = regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)

// GetMovieGrade returns the text of the first property="v:average" element in
// movieHtml, or "" when the page contains none.
func GetMovieGrade(movieHtml string) string {
	match := gradePattern.FindStringSubmatch(movieHtml)
	if match == nil {
		// No rating markup: report "" instead of indexing an empty result.
		return ""
	}
	return match[1]
}



// releaseDatePattern captures the text of a span tagged
// property="v:initialReleaseDate".
var releaseDatePattern = regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)

// GetMovieOnTime returns the text of the first
// property="v:initialReleaseDate" span in movieHtml, or "" when the page
// contains none.
func GetMovieOnTime(movieHtml string) string {
	match := releaseDatePattern.FindStringSubmatch(movieHtml)
	if match == nil {
		// No release-date markup: report "" instead of indexing an empty result.
		return ""
	}
	return match[1]
}

// runtimePattern captures the text of a span tagged property="v:runtime".
var runtimePattern = regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)

// GetMovieRunningTime returns the text of the first property="v:runtime" span
// in movieHtml, or "" when the page contains none.
func GetMovieRunningTime(movieHtml string) string {
	match := runtimePattern.FindStringSubmatch(movieHtml)
	if match == nil {
		// No runtime markup: report "" instead of indexing an empty result.
		return ""
	}
	return match[1]
}

获取到多个主演

// starringPattern captures the text of an anchor tagged rel="v:starring".
var starringPattern = regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

// GetMovieMainCharacters returns the text of every rel="v:starring" anchor in
// movieHtml, joined with "/" and without a trailing separator. It returns ""
// when the page lists no cast.
func GetMovieMainCharacters(movieHtml string) string {
	matches := starringPattern.FindAllStringSubmatch(movieHtml, -1)

	names := make([]string, 0, len(matches))
	for _, m := range matches {
		names = append(names, m[1])
	}
	return strings.Join(names, "/")
}


// genrePattern captures the text of a span tagged property="v:genre".
var genrePattern = regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)

// GetMovieGenre returns the text of every property="v:genre" span in
// movieHtml, joined with "/" and without a trailing separator. It returns ""
// when the page lists no genre.
func GetMovieGenre(movieHtml string) string {
	matches := genrePattern.FindAllStringSubmatch(movieHtml, -1)

	genres := make([]string, 0, len(matches))
	for _, m := range matches {
		genres = append(genres, m[1])
	}
	return strings.Join(genres, "/")
}

4、通过url获取源代码内容

  // Page to download: a single Douban movie page.
  sUrl := "https://movie.douban.com/subject/25827935/"
  // Alternative page, kept commented out:
  //sUrl = "https://movie.douban.com/subject/6786002/?from=subject-page"

  // Request the page with httplib and read the response as a string.
  rsp := httplib.Get(sUrl)
  sMovieHtml,err := rsp.String()
  if err != nil{
      // The download failed; abort with the underlying error.
      panic(err)
  }

5、数据存在数据库里

  // Collect every field extracted from the downloaded page into one record.
  movieInfo := models.MovieInfo{
      Movie_name:           models.GetMovieName(sMovieHtml),
      Movie_director:       models.GetMovieDirector(sMovieHtml),
      Movie_main_character: models.GetMovieMainCharacters(sMovieHtml),
      Movie_type:           models.GetMovieGenre(sMovieHtml),
      Movie_on_time:        models.GetMovieOnTime(sMovieHtml),
      Movie_grade:          models.GetMovieGrade(sMovieHtml),
      Movie_span:           models.GetMovieRunningTime(sMovieHtml),
  }

  // Store the record and report the new row id; a failed insert is reported
  // to the client instead of being silently discarded.
  if id, err := models.AddMovie(&movieInfo); err != nil {
      c.Ctx.WriteString("add movie failed: " + err.Error())
  } else {
      c.Ctx.WriteString(fmt.Sprintf("%v", id))
  }
  
//movie_info.go

// AddMovie inserts movie_info as a new row and returns the id and error
// reported by db.Insert.
func AddMovie(movie_info *MovieInfo) (int64, error) {
	// Always insert as a new row: clear any id left over from an earlier
	// insert of the same struct so the caller can reuse it.
	movie_info.Id = 0
	return db.Insert(movie_info)
}

6、找到当前网页的其他电影链接

// movieURLPattern captures the href of anchors that point at
// https://movie.douban.com/. The dots in the host are escaped so they match
// only a literal ".".
var movieURLPattern = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns, in document order, every https://movie.douban.com/
// link found in an anchor's href in movieHtml. Duplicates are kept. It
// returns nil when the page contains no such link.
func GetMovieUrls(movieHtml string) []string {
	matches := movieURLPattern.FindAllStringSubmatch(movieHtml, -1)
	if len(matches) == 0 {
		return nil
	}

	movieSets := make([]string, 0, len(matches))
	for _, m := range matches {
		movieSets = append(movieSets, m[1])
	}
	return movieSets
}

7、使用redis

 ✘ ⚙  ~  redis-server
 
78176:C 02 Aug 2020 18:27:30.697 # oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
78176:C 02 Aug 2020 18:27:30.699 # Redis version=5.0.5, bits=64, commit=00000000, modified=0, pid=78176, just started
78176:C 02 Aug 2020 18:27:30.699 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf
78176:M 02 Aug 2020 18:27:30.702 * Increased maximum number of open files to 10032 (it was originally set to 256).
                _._
           _.-``__ ''-._
      _.-``    `.  `_.  ''-._           Redis 5.0.5 (00000000/0) 64 bit
  .-`` .-```.  ```\/    _.,_ ''-._
 (    '      ,       .-`  | `,    )     Running in standalone mode
 |`-._`-...-` __...-.``-._|'` _.-'|     Port: 6379
 |    `-._   `._    /     _.-'    |     PID: 78176
  `-._    `-._  `-./  _.-'    _.-'
 |`-._`-._    `-.__.-'    _.-'_.-'|
 |    `-._`-._        _.-'_.-'    |           http://redis.io
  `-._    `-._`-.__.-'_.-'    _.-'
 |`-._`-._    `-.__.-'    _.-'_.-'|
 |    `-._`-._        _.-'_.-'    |
  `-._    `-._`-.__.-'_.-'    _.-'
      `-._    `-.__.-'    _.-'
          `-._        _.-'
              `-.__.-'

78176:M 02 Aug 2020 18:27:30.724 # Server initialized
78176:M 02 Aug 2020 18:27:30.724 * Ready to accept connections
 ✘  ~  redis-cli
 
127.0.0.1:6379> keys *
1) "url_queue"

127.0.0.1:6379> LRANGE url_queue 0 -1

 1) "https://movie.douban.com/feed/subject/25827935/reviews"
 2) "https://movie.douban.com/subject/25827935/wishes"
 3) "https://movie.douban.com/subject/25827935/collections"
 4) "https://movie.douban.com/subject/25827935/doulists"
 5) "https://movie.douban.com/subject/25827935/questions/727598/?from=subject"
 6) "https://movie.douban.com/subject/25827935/questions/727693/?from=subject"
 7) "https://movie.douban.com/subject/25827935/discussion/616357468/"
 8) "https://movie.douban.com/subject/25827935/discussion/614770818/"
 9) "https://movie.douban.com/subject/25827935/discussion/616710371/"
10) "https://movie.douban.com/subject/25827935/discussion/616710732/"
11) "https://movie.douban.com/subject/25827935/discussion/616717788/"
12) "https://movie.douban.com/review/8085061/#comments"
13) "https://movie.douban.com/review/8085061/"
14) "https://movie.douban.com/review/8096152/#comments"
15) "https://movie.douban.com/review/8096152/"
16) "https://movie.douban.com/review/8090401/#comments"
17) "https://movie.douban.com/review/8090401/"
18) "https://movie.douban.com/review/8087492/#comments"
19) "https://movie.douban.com/review/8087492/"
20) "https://movie.douban.com/review/8092536/#comments"
21) "https://movie.douban.com/review/8092536/"
22) "https://movie.douban.com/review/8087571/#comments"
23) "https://movie.douban.com/review/8087571/"
24) "https://movie.douban.com/review/8100255/#comments"
25) "https://movie.douban.com/review/8100255/"
26) "https://movie.douban.com/review/8083968/#comments"
27) "https://movie.douban.com/review/8083968/"
28) "https://movie.douban.com/review/8085143/#comments"
29) "https://movie.douban.com/review/8085143/"
30) "https://movie.douban.com/review/8076701/#comments"
31) "https://movie.douban.com/review/8076701/"
32) "https://movie.douban.com/subject/25827935/comments?status=P"
33) "https://movie.douban.com/subject/27024903/?from=subject-page"
34) "https://movie.douban.com/subject/27024903/?from=subject-page"
35) "https://movie.douban.com/subject/26862829/?from=subject-page"
36) "https://movie.douban.com/subject/26862829/?from=subject-page"
37) "https://movie.douban.com/subject/11529526/?from=subject-page"
38) "https://movie.douban.com/subject/11529526/?from=subject-page"
39) "https://movie.douban.com/subject/6874741/?from=subject-page"
40) "https://movie.douban.com/subject/6874741/?from=subject-page"
41) "https://movie.douban.com/subject/25716096/?from=subject-page"
42) "https://movie.douban.com/subject/25716096/?from=subject-page"
43) "https://movie.douban.com/subject/4739952/?from=subject-page"
44) "https://movie.douban.com/subject/4739952/?from=subject-page"
45) "https://movie.douban.com/subject/4920528/?from=subject-page"
46) "https://movie.douban.com/subject/4920528/?from=subject-page"
47) "https://movie.douban.com/subject/30166972/?from=subject-page"
48) "https://movie.douban.com/subject/30166972/?from=subject-page"
49) "https://movie.douban.com/subject/26366465/?from=subject-page"
50) "https://movie.douban.com/subject/26366465/?from=subject-page"
51) "https://movie.douban.com/subject/3319755/?from=subject-page"
52) "https://movie.douban.com/subject/3319755/?from=subject-page"
53) "https://movie.douban.com/awards/golden-rooster/31/"
54) "https://movie.douban.com/awards/hkfaa/36/"
55) "https://movie.douban.com/awards/goldenhorse/53/"
56) "https://movie.douban.com/subject/25827935/awards/"
57) "https://movie.douban.com/photos/photo/2372679263/"
58) "https://movie.douban.com/photos/photo/2374163695/"
59) "https://movie.douban.com/video/100597/"
60) "https://movie.douban.com/trailer/203039/#content"
61) "https://movie.douban.com/subject/25827935/mupload"
62) "https://movie.douban.com/subject/25827935/all_photos"
63) "https://movie.douban.com/subject/25827935/trailer#short_video"
64) "https://movie.douban.com/subject/25827935/trailer#trailer"
65) "https://movie.douban.com/celebrity/1365506/"
66) "https://movie.douban.com/celebrity/1365506/"
67) "https://movie.douban.com/celebrity/1328349/"
68) "https://movie.douban.com/celebrity/1328349/"
69) "https://movie.douban.com/celebrity/1349387/"
70) "https://movie.douban.com/celebrity/1349387/"
71) "https://movie.douban.com/celebrity/1275243/"
72) "https://movie.douban.com/celebrity/1275243/"
73) "https://movie.douban.com/celebrity/1274224/"
74) "https://movie.douban.com/celebrity/1274224/"
75) "https://movie.douban.com/celebrity/1274534/"
76) "https://movie.douban.com/celebrity/1274534/"
77) "https://movie.douban.com/help/movie#t0-qs"
78) "https://movie.douban.com/subject/25827935/photos?type=R"
79) "https://movie.douban.com/annual/2019?source=movie_navigation"
80) "https://movie.douban.com/annual/2019?source=navigation"
81) "https://movie.douban.com/review/best/"
82) "https://movie.douban.com/tag/"
83) "https://movie.douban.com/chart"
84) "https://movie.douban.com/tv/"
85) "https://movie.douban.com/explore"
86) "https://movie.douban.com/cinema/nowplaying/"
//redis.go

package models

import (
	"github.com/astaxie/goredis"
)

// URL_QUEUE is the Redis key of the list used as the URL queue.
const URL_QUEUE = "url_queue"

// URL_VISIT_SET is the Redis key of the set of visited URLs.
const URL_VISIT_SET = "url_visit_set"

// client is the Redis client shared by every helper in this file; its address
// is set by ConnectRedis.
var client goredis.Client

// ConnectRedis points the shared client at addr. It only stores the address.
func ConnectRedis(addr string) {
	client.Addr = addr
}

// PutinQueue pushes url onto the URL_QUEUE list with Lpush. The result of the
// push is discarded.
func PutinQueue(url string) {
	payload := []byte(url)
	client.Lpush(URL_QUEUE, payload)
}

// PopfromQueue pops one entry from the URL_QUEUE list with Rpop and returns it
// as a string. It panics if Rpop reports an error.
func PopfromQueue() string {
	raw, err := client.Rpop(URL_QUEUE)
	if err == nil {
		return string(raw)
	}
	panic(err)
}

// GetQueueLength returns the length of the URL_QUEUE list, or 0 when Llen
// reports an error.
func GetQueueLength() int {
	if n, err := client.Llen(URL_QUEUE); err == nil {
		return n
	}
	return 0
}

// AddToSet adds url to the URL_VISIT_SET set with Sadd. The result of the
// call is discarded.
func AddToSet(url string) {
	member := []byte(url)
	client.Sadd(URL_VISIT_SET, member)
}

// IsVisit reports whether url is a member of the URL_VISIT_SET set. It returns
// false when Sismember reports an error.
func IsVisit(url string) bool {
	seen, err := client.Sismember(URL_VISIT_SET, []byte(url))
	return err == nil && seen
}


8、递归查找链接

//movie_info.go

package models

import (
	_ "github.com/go-sql-driver/mysql"
	"github.com/astaxie/beego/orm"
	"regexp"
	"strings"
)

// db is the package-wide ORM handle; init assigns it.
var db orm.Ormer

// MovieInfo holds the data extracted for one movie page; it is the model
// registered with the ORM.
type MovieInfo struct {
	Id                   int64  // row id; AddMovie resets it to 0 before inserting
	Movie_id             int64  // movie id
	Movie_name           string // movie title
	Movie_pic            string // movie picture
	Movie_director       string // director
	Movie_writer         string // screenwriter
	Movie_country        string // country of production
	Movie_language       string // language
	Movie_main_character string // main cast, "/"-separated
	Movie_type           string // genres, "/"-separated
	Movie_on_time        string // release date
	Movie_span           string // running time
	Movie_grade          string // rating
}

// init sets up the ORM for this package: debug output on, the "default" MySQL
// database and the MovieInfo model registered, and db assigned a new Ormer.
//
// A failed database registration panics here with the registration error, so
// the cause shows up at startup.
//
// NOTE(review): the DSN, including credentials, is hard-coded here.
func init() {
	// Debug mode prints the generated SQL statements.
	orm.Debug = true

	// The trailing 30 is RegisterDataBase's optional int parameter
	// (presumably the idle-connection limit — confirm against the beego docs).
	if err := orm.RegisterDataBase("default", "mysql", "root:123@tcp(127.0.0.1:3306)/test?charset=utf8", 30); err != nil {
		panic(err)
	}

	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}

// AddMovie resets movie_info.Id to 0, inserts the record through db, and
// returns the id and error reported by db.Insert.
func AddMovie(movie_info *MovieInfo) (int64, error) {
	movie_info.Id = 0
	return db.Insert(movie_info)
}

// GetMovieDirector returns the text of the first rel="v:directedBy" anchor in
// movieHtml, or "" when there is none.
func GetMovieDirector(movieHtml string) string {
	pattern := regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)
	if first := pattern.FindStringSubmatch(movieHtml); first != nil {
		return first[1]
	}
	return ""
}

// GetMovieName returns the text of the first property="v:itemreviewed" span
// in movieHtml, or "" when there is none.
func GetMovieName(movieHtml string) string {
	pattern := regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)
	first := pattern.FindStringSubmatch(movieHtml)
	if first == nil {
		return ""
	}
	return first[1]
}

// GetMovieMainCharacters collects the text of every rel="v:starring" anchor
// in movieHtml and returns the names separated by "/", with "/" trimmed from
// both ends. It returns "" when there is no match.
func GetMovieMainCharacters(movieHtml string) string {
	pattern := regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

	var names []string
	for _, m := range pattern.FindAllStringSubmatch(movieHtml, -1) {
		names = append(names, m[1])
	}
	return strings.Trim(strings.Join(names, "/"), "/")
}

// GetMovieGrade returns the text of the first property="v:average" element in
// movieHtml, or "" when there is none.
func GetMovieGrade(movieHtml string) string {
	pattern := regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)
	if first := pattern.FindStringSubmatch(movieHtml); len(first) > 1 {
		return first[1]
	}
	return ""
}

// GetMovieGenre collects the text of every property="v:genre" span in
// movieHtml and returns the genres separated by "/", with "/" trimmed from
// both ends. It returns "" when there is no match.
func GetMovieGenre(movieHtml string) string {
	pattern := regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)
	found := pattern.FindAllStringSubmatch(movieHtml, -1)

	genres := make([]string, len(found))
	for i := range found {
		genres[i] = found[i][1]
	}
	return strings.Trim(strings.Join(genres, "/"), "/")
}

// GetMovieOnTime returns the text of the first
// property="v:initialReleaseDate" span in movieHtml, or "" when there is none.
func GetMovieOnTime(movieHtml string) string {
	pattern := regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)
	first := pattern.FindStringSubmatch(movieHtml)
	if first != nil {
		return first[1]
	}
	return ""
}

// GetMovieRunningTime returns the text of the first property="v:runtime" span
// in movieHtml, or "" when there is none.
func GetMovieRunningTime(movieHtml string) string {
	pattern := regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)
	switch first := pattern.FindStringSubmatch(movieHtml); {
	case first == nil:
		return ""
	default:
		return first[1]
	}
}


// doubanLinkPattern captures the href of anchors pointing at
// https://movie.douban.com/. The host's dots are escaped so that they match
// only a literal "." rather than any character.
var doubanLinkPattern = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns every https://movie.douban.com/ link found in an
// anchor's href in movieHtml, in document order and including duplicates. The
// result is nil when no link is found.
func GetMovieUrls(movieHtml string) []string {
	var movieSets []string
	for _, m := range doubanLinkPattern.FindAllStringSubmatch(movieHtml, -1) {
		movieSets = append(movieSets, m[1])
	}
	return movieSets
}
//crawlMovie.go

package controllers

import (
	"crawl_movie/models"
	"github.com/astaxie/beego"
  "github.com/astaxie/beego/httplib"
  "time"
)

// CrawlMovieController is the controller that hosts the CrawlMovie action; it
// embeds beego.Controller.
type CrawlMovieController struct {
	beego.Controller
}

// CrawlMovie crawls Douban movie pages starting from a fixed entry URL. A
// Redis list serves as the URL queue and a Redis set remembers the URLs that
// were already crawled. Every page that carries a movie title is stored
// through models.AddMovie, and every link found is echoed to the response.
//
// Only static HTML can be crawled this way; dynamically generated content
// (such as parts of JD.com) cannot. A component such as PhantomJS could be
// used for dynamic pages.
//
// NOTE(review): discovered links are written to the response unescaped.
func (c *CrawlMovieController) CrawlMovie() {
	// Connect to Redis.
	models.ConnectRedis("127.0.0.1:6379")

	// Seed the queue with the crawl entry URL.
	models.PutinQueue("https://movie.douban.com/subject/25827935/")

	// Keep going until the URL queue is empty.
	for models.GetQueueLength() != 0 {
		sUrl := models.PopfromQueue()
		// Skip URLs that were already crawled.
		if models.IsVisit(sUrl) {
			continue
		}
		// Record the URL as visited before fetching it, so a page whose
		// download fails is not fetched again when other pages link to it.
		models.AddToSet(sUrl)

		req := httplib.Get(sUrl)
		// User-Agent and Cookie are set to avoid a 403 from Douban.
		req.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
		req.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)

		sMovieHtml, err := req.String()
		if err != nil {
			// One unreachable page must not abort the whole crawl: log it,
			// keep the request pacing, and move on to the next URL.
			beego.Error("crawl movie: fetching", sUrl, "failed:", err)
			time.Sleep(time.Second)
			continue
		}

		// Store the movie record; pages without a movie title are not stored.
		if name := models.GetMovieName(sMovieHtml); name != "" {
			// A fresh record per page keeps data from one page out of the next.
			movieInfo := models.MovieInfo{
				Movie_name:           name,
				Movie_director:       models.GetMovieDirector(sMovieHtml),
				Movie_main_character: models.GetMovieMainCharacters(sMovieHtml),
				Movie_type:           models.GetMovieGenre(sMovieHtml),
				Movie_on_time:        models.GetMovieOnTime(sMovieHtml),
				Movie_grade:          models.GetMovieGrade(sMovieHtml),
				Movie_span:           models.GetMovieRunningTime(sMovieHtml),
			}
			if _, err := models.AddMovie(&movieInfo); err != nil {
				beego.Error("crawl movie: saving", sUrl, "failed:", err)
			}
		}

		// Extract every link on the page. All of them are echoed to the
		// response, but only links not crawled yet are queued.
		for _, link := range models.GetMovieUrls(sMovieHtml) {
			if !models.IsVisit(link) {
				models.PutinQueue(link)
			}
			c.Ctx.WriteString("<br>" + link + "</br>")
		}

		// Wait a second between pages.
		time.Sleep(time.Second)
	}

	c.Ctx.WriteString("end of crawl!")
}

END

2020年08月02日19:31:37

2020年08月02日19:56:44

posted @ 2020-08-02 19:58  一颗小苹果  阅读(720)  评论(0)  编辑  收藏  举报