R Big Data Analysis and Mining (4: Implementing a Crawler in R)

 library("XML")

# Step 1: collect all tag links from the tag index page
url <- 'http://www.csdn.net/tag/'
i_url_parse <- htmlParse(url, encoding="UTF-8")
xpath_1 <- "//div[@class='overflow']/a"   # every <a> in the tag list
node <- getNodeSet(i_url_parse, xpath_1)

for(j in 1:length(node)){
    value2 <- xmlGetAttr(node[[j]], name='href')  # the tag page URL
    print(value2)
}

Result: the href of every tag link is printed.

The handler function in R. It parses one listing page, pulls the title and source of every article, and then recursively calls itself on the '下一页' (next page) link, which is what drives the crawl:

handler <- function(url){
    i_url_parse <- htmlParse(url, encoding="UTF-8")
    xpath_1 <- "//div[@class='line_list']/*"
    node <- getNodeSet(i_url_parse, xpath_1)
    # walk all child divs of the article list
    for(j in 1:length(node)){
        value <- xmlGetAttr(node[[j]], name='class')
        if(is.null(value)) next                  # skip nodes without a class attribute
        if(value == 'tit_list'){
            title <- xmlValue(node[[j]])         # article title
        }else if(value == 'dwon_words'){
            xpath_2 <- "//div[@class='dwon_words']/span[@class='tag_source']/a"
            node1 <- getNodeSet(i_url_parse, xpath_2)
            for(k in 1:length(node1)){
                value <- xmlGetAttr(node1[[k]], name='href')  # source link
                text <- xmlValue(node1[[k]])                  # source name
            }
        }
    }
    # recursive call of handler: follow the next-page link in the pagination bar
    xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
    node3 <- getNodeSet(i_url_parse, xpath_3)
    for(m in 1:length(node3)){
        value1 <- xmlValue(node3[[m]])
        if(value1 == '下一页'){                   # the link labelled "next page"
            next_url <- xmlGetAttr(node3[[m]], name='href')
            next_url <- paste('http://www.csdn.net', next_url, sep="")  # href is site-relative
            print(next_url)
            Sys.sleep(25)                        # wait 25 seconds between requests
            handler(next_url)
        }
    }
}

Kicking off the news crawl for a single tag:

# process the news listing for the android tag
url <- 'http://www.csdn.net/tag/android/news'
i_url_parse <- htmlParse(url, encoding="UTF-8")
xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
node3 <- getNodeSet(i_url_parse, xpath_3)
for(m in 1:length(node3)){
    value1 <- xmlValue(node3[[m]])
    if(value1 == '下一页'){                      # "next page"
        next_url <- xmlGetAttr(node3[[m]], name='href')
        next_url <- paste('http://www.csdn.net', next_url, sep="")
        print(next_url)
        handler(next_url)
    }
}
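
To crawl every tag rather than just android, the links collected in step 1 can seed handler directly. A minimal sketch, assuming node still holds the <a> elements from the tag index page and that their hrefs are site-relative paths (as the pagination hrefs above are):

# hypothetical glue between step 1 and handler
for(j in 1:length(node)){
    tag_href <- xmlGetAttr(node[[j]], name='href')
    handler(paste('http://www.csdn.net', tag_href, sep=""))
}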
Code snippet 2: walk all 27 letter categories on http://www.csdn.net/tag/, page through each tag's news list, and append the results to one file per letter.
rm(list=ls())
gc()
library(bitops)
library(RCurl)
library(curl)
library(XML)
alphabet <- c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","#")
tags <- data.frame('tag'=NA)     # tags under the current letter category
result <- data.frame('title'=NA,'source'=NA,'tags'=NA,'author'=NA,'letter'=NA,'tag'=NA)
x <- getURLContent("http://www.csdn.net/tag/",encoding="UTF-8")
url_tag <- htmlParse(x, encoding="UTF-8")
num <- 0
for (i in 1:27){                 # one iteration per letter category
	xpath_tag <- paste("/html/body/div/div/ul/li[",i,"]/div/div/a",sep="")
	node_tag <- getNodeSet(url_tag, xpath_tag)
	m <- length(node_tag)
	print(paste("letter category",alphabet[i],"has",m,"tags"))
	if(m==0){
		print(paste("no tag nodes found for letter category",alphabet[i]))
	}else{
		for (j in 1:m){
			tags[j,] <- xmlValue(node_tag[[j]])
			for(k in 1:10000){   # page through this tag's news list
				url <- paste("http://www.csdn.net/tag/",tags[j,1],"/news-",k,sep="")
				y <- getURLContent(url,encoding="UTF-8")
				url_news <- htmlParse(y, encoding="UTF-8")
				# the "no results" page marks the end of the pagination
				node_not_exists <- getNodeSet(url_news,"//div[@class='not_search']")
				if(length(node_not_exists)!=0){
					break
				}else{
					node_news <- getNodeSet(url_news, "//div[@class='line_list']")
					n <- length(node_news)
					if(n==0){
						print(paste("tag",tags[j,1],"of letter category",alphabet[i],"has no news"))
					}else{
						print(paste("tag",tags[j,1],"of letter category",alphabet[i],"has",n,"items on page",k))
						for(p in 1:n){
							num <- num+1
							node_title <- getNodeSet(url_news,paste('/html/body/div/div/div[3]/div[2]/ul/li[',p,']/div/a',sep=""))
							node_source <- getNodeSet(url_news,paste('/html/body/div/div/div[3]/div[2]/ul/li[',p,']/div/div/span[1]/a',sep=""))
							node_tags <- getNodeSet(url_news,paste('/html/body/div/div/div[3]/div[2]/ul/li[',p,']/div/div/span[2]/a',sep=""))
							node_author <- getNodeSet(url_news,paste('/html/body/div/div/div[3]/div[2]/ul/li[',p,']/div/div/span[3]/a',sep=""))
							result[num,1] <- xmlValue(node_title[[1]])
							result[num,2] <- xmlValue(node_source[[1]])
							result[num,3] <- paste(sapply(node_tags, xmlValue), collapse=",")  # join all tags
							result[num,4] <- xmlValue(node_author[[1]])
							result[num,5] <- alphabet[i]
							result[num,6] <- tags[j,1]
						}
						# the first batch writes column names; later batches append rows only
						if(num <= n){
							write.table(result[1:num,1:6],file=paste(alphabet[i],".r",sep=""),append=TRUE,col.names=TRUE)
						}else{
							write.table(result[(num-n+1):num,1:6],file=paste(alphabet[i],".r",sep=""),append=TRUE,col.names=FALSE)
						}
						Sys.sleep(2)   # pause between pages to go easy on the server
						rm(result)
						Sys.sleep(1)
						gc()
						result <- data.frame('title'=NA,'source'=NA,'tags'=NA,'author'=NA,'letter'=NA,'tag'=NA)
						print(paste("page",k,"of tag",tags[j,1],"in letter category",alphabet[i],"finished:",n,"items"))
						print(paste("i:",i," j:",j," k:",k," n:",n," num:",num,sep=""))
					}
				}
			}
		}
	}
}

Output when run in the R GUI: (screenshot omitted)

There are still problems, and the code needs further improvement.
(2) Multithreading in R
1. Install an R multithreading package
2. Load the package
3. Specify the number of CPU cores
4. Call the log function in parallel and assign the return value to result
5. Stop the cluster (see the sketch below)
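
A minimal sketch of the five steps, assuming the snow package and 4 cores (both are assumptions; the built-in parallel package exposes the same makeCluster/parLapply/stopCluster API):

install.packages("snow")               # 1. install a multithreading package (assumed: snow)
library(snow)                          # 2. load the package
cl <- makeCluster(4, type="SOCK")      # 3. specify the number of CPU cores (assumed: 4)
result <- parLapply(cl, 1:1000, log)   # 4. call log in parallel and assign to result
stopCluster(cl)                        # 5. stop the cluster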

(3) Setting a proxy in R

curl <- getCurlHandle(proxy="10.10.10.10:8080")
getURL("http://baidu.com", curl=curl)
posted @ 2016-01-19 10:44 CJZhaoSimons