R for Big Data Analysis and Mining (4: Implementing a Web Crawler in R)
Code segment 1:
library("XML")

# Get all the tag links from the index page
url <- 'http://www.csdn.net/tag/'
i_url_parse <- htmlParse(url, encoding = "UTF-8")
xpath_1 <- "//div[@class='overflow']/a"
node <- getNodeSet(i_url_parse, xpath_1)
for (j in 1:length(node)) {
  value2 <- xmlGetAttr(node[[j]], name = 'href')
  print(value2)
}
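Printing is fine for inspection; to keep the links around for later use, the same extraction can be collected into a character vector (a small variation on the loop above, not from the original post):

# Collect every href into one character vector instead of printing.
links <- sapply(node, xmlGetAttr, name = 'href')
head(links)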
# handler: parse one listing page, pull out each entry,
# then recurse through the pagination links
handler <- function(url) {
  xpath_1 <- "//div[@class='line_list']/*"
  i_url_parse <- htmlParse(url, encoding = "UTF-8")
  node <- getNodeSet(i_url_parse, xpath_1)
  # walk every child div of line_list
  for (j in 1:length(node)) {
    value <- xmlGetAttr(node[[j]], name = 'class')
    if (!is.null(value) && value == 'tit_list') {
      title <- xmlValue(node[[j]])
    } else if (!is.null(value) && value == 'dwon_words') {
      xpath_2 <- "//div[@class='dwon_words']/span[@class='tag_source']/a"
      node1 <- getNodeSet(i_url_parse, xpath_2)
      for (k in 1:length(node1)) {
        value <- xmlGetAttr(node1[[k]], name = 'href')
        text <- xmlValue(node1[[k]])
      }
    }
  }
  # recursively call handler on the next page, if there is one
  xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
  node3 <- getNodeSet(i_url_parse, xpath_3)
  for (m in 1:length(node3)) {
    value1 <- xmlValue(node3[[m]])
    if (value1 == '下一页') {  # '下一页' = "next page"
      next_url <- xmlGetAttr(node3[[m]], name = 'href')
      next_url <- paste('http://www.csdn.net', next_url, sep = "")
      print(next_url)
      Sys.sleep(25)  # throttle: wait 25 seconds between pages
      handler(next_url)
    }
  }
}
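Note that handler() walks the pagination recursively: every page that still has a '下一页' link triggers another call after the 25-second pause, so a tag with many pages means a correspondingly deep call stack and a long run. The recursion ends naturally on the last page, where no such link exists.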
# Handle the news listings for one tag separately
url <- 'http://www.csdn.net/tag/android/news'
i_url_parse <- htmlParse(url, encoding = "UTF-8")  # parse the news listing page
xpath_3 <- "//div[@class='page_nav']/a[@class='pageliststy']"
node3 <- getNodeSet(i_url_parse, xpath_3)
for (m in 1:length(node3)) {
  value1 <- xmlValue(node3[[m]])
  if (value1 == '下一页') {
    next_url <- xmlGetAttr(node3[[m]], name = 'href')
    next_url <- paste('http://www.csdn.net', next_url, sep = "")  # prefix the site-relative href
    print(next_url)
    handler(next_url)
  }
}
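To tie the pieces together, a hypothetical driver could feed each tag link found on the index page (the node set from the first snippet) into handler(); the '/news' suffix and the assumption that the hrefs are site-relative paths are mine, not from the original code:

# Hypothetical driver: crawl the news listings of every tag found above.
for (j in 1:length(node)) {
  tag_href <- xmlGetAttr(node[[j]], name = 'href')  # assumed to be a site-relative path
  handler(paste('http://www.csdn.net', tag_href, '/news', sep = ""))
}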
Code segment 2: crawling every tag's news listings with RCurl.
rm(list = ls())
gc()
library(bitops)
library(RCurl)
library(curl)
library(XML)

alphabet <- c("A","B","C","D","E","F","G","H","I","J","K","L","M",
              "N","O","P","Q","R","S","T","U","V","W","X","Y","Z","#")
tags <- data.frame('字母分类具体标签' = NA)   # the concrete tags under one letter
temp <- data.frame('循环多个标签临时' = NA)   # scratch space for a multi-tag entry
result <- data.frame('文章标题' = NA, '文章来源' = NA, '文章标签' = NA,
                     '文章发布者' = NA, '字母分类' = NA, '字母分类具体标签' = NA)

x <- getURLContent("http://www.csdn.net/tag/", encoding = "UTF-8")
url_tag <- htmlParse(x, encoding = "UTF-8")
num <- 0
for (i in 1:27) {  # 26 letters plus "#"
  xpath_tag <- paste("/html/body/div/div/ul/li[", i, "]/div/div/a", sep = "")
  node_tag <- getNodeSet(url_tag, xpath_tag)
  m <- length(node_tag)
  print(paste(alphabet[i], "字母分类有", m, "个具体标签!", sep = " "))
  if (m == 0) {
    print(paste("没有找到", alphabet[i], "字母分类的具体标签相关节点!~~~", sep = " "))
  } else {
    for (j in 1:m) {
      tags[j, ] <- xmlValue(node_tag[[j]])
      for (k in 1:10000) {  # page through this tag's news listings
        url <- paste("http://www.csdn.net/tag/", tags[j, 1], "/news-", k, sep = "")
        y <- getURLContent(url, encoding = "UTF-8")
        url_news <- htmlParse(y, encoding = "UTF-8")
        # a 'not_search' div marks a page past the last one
        node_not_exists <- getNodeSet(url_news, "//div[@class='not_search']")
        if (length(node_not_exists) != 0) {
          break
        } else {
          node_news <- getNodeSet(url_news, "//div[@class='line_list']")
          n <- length(node_news)
          if (n == 0) {
            print(paste(alphabet[i], "字母分类具体标签", tags[j, 1], "的资讯不存在!~~~", sep = " "))
          } else {
            print(paste(alphabet[i], "字母分类的具体标签", tags[j, 1], "第", k, "页有", n, "条资讯!", sep = " "))
            for (p in 1:n) {  # n entries on this page (the last page can have fewer than 10)
              num <- num + 1
              node_title  <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/a', sep = ""))
              node_source <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[1]/a', sep = ""))
              node_tags   <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[2]/a', sep = ""))
              node_author <- getNodeSet(url_news, paste('/html/body/div/div/div[3]/div[2]/ul/li[', p, ']/div/div/span[3]/a', sep = ""))
              result[num, 1] <- xmlValue(node_title[[1]])
              result[num, 2] <- xmlValue(node_source[[1]])
              for (q in 1:length(node_tags)) {
                temp[q, 1] <- xmlValue(node_tags[[q]])
              }
              # join all of this entry's tags into one comma-separated string
              result[num, 3] <- paste(temp[1:length(node_tags), 1], collapse = ",")
              result[num, 4] <- xmlValue(node_author[[1]])
              result[num, 5] <- alphabet[i]
              result[num, 6] <- tags[j, 1]
            }
            # write this page's rows out; only the first batch carries column names
            if (num <= n) {
              write.table(result[1:num, 1:6], file = paste(alphabet[i], ".r", sep = ""),
                          append = TRUE, col.names = TRUE)
            } else {
              write.table(result[(num - n + 1):num, 1:6], file = paste(alphabet[i], ".r", sep = ""),
                          append = TRUE, col.names = FALSE)
            }
            Sys.sleep(2)  # pause between pages to be polite to the server
            rm(result)
            Sys.sleep(1)
            gc()
            result <- data.frame('文章标题' = NA, '文章来源' = NA, '文章标签' = NA,
                                 '文章发布者' = NA, '字母分类' = NA, '字母分类具体标签' = NA)
            print(paste(alphabet[i], "字母分类的具体标签", tags[j, 1], "第", k, "页的", n, "条资讯已抓取完毕!", sep = " "))
            print(paste("字母分类i数字:", i, "字母分类的具体标签j数字:", j,
                        "资讯页码k数字:", k, "资讯条目n数字:", n, "已抓取条数num数字:", num, sep = ""))
          }
        }
      }
    }
  }
}
Results of running this in the R GUI: it has problems and still needs improvement.
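One direction for that improvement, as a hedged sketch: wrap each page fetch in tryCatch so a single failed request does not abort the whole run. The retry count and the URLencode call are my own additions, not part of the original code:

# Defensive fetch for code segment 2 (assumptions: 3 retries, URL-encoding
# of tag names that may contain non-ASCII characters).
fetch_page <- function(url, retries = 3) {
  url <- URLencode(url)
  for (attempt in 1:retries) {
    y <- tryCatch(getURLContent(url, encoding = "UTF-8"),
                  error = function(e) NULL)
    if (!is.null(y)) return(y)
    Sys.sleep(5)  # wait before retrying a failed request
  }
  NULL  # give up after all retries fail
}

# Usage inside the paging loop: skip the page when the fetch keeps failing.
# y <- fetch_page(url); if (is.null(y)) next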
(2) Multithreading in R
1. Install R's multithreading package
2. Load the package
3. Specify the number of CPU cores
4. Call the log function and assign the output to result
5. Stop the cluster
The steps are sketched in code below.
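A minimal sketch of the five steps, assuming the parallel package that ships with R is the multithreading package meant (the add-on snow package works the same way):

# install.packages("snow")           # 1. only needed for the add-on snow package
library(parallel)                    # 2. load the package (bundled with R)
cl <- makeCluster(4)                 # 3. specify the number of CPU cores to use
result <- parLapply(cl, 1:100, log)  # 4. apply log in parallel, assign to result
stopCluster(cl)                      # 5. stop the cluster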
(3) Setting a proxy in R
curl <- getCurlHandle(proxy = "10.10.10.10:8080")
getURL("http://baidu.com", curl = curl)
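The same handle can be reused for later requests, so the crawler in code segment 2 would also go through the proxy (the proxy address above is a placeholder; substitute a real one):

# Reuse the proxied handle for the tag index fetch from code segment 2.
x <- getURLContent("http://www.csdn.net/tag/", curl = curl, encoding = "UTF-8")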
Blog: http://www.cnblogs.com/jackchen-Net/
