path <- "C:/Users/lujian/Desktop/input1" ##文件目录
fileNames <- dir(path)[1:366] ##获取该路径下的文件名
filePath <- sapply(fileNames, function(x){
paste(path,x,sep='/')}) ##生成读取文件路径
##读取数据,结果为list
data <- sapply(filePath, function(x){
read.csv(x, header=FALSE, stringsAsFactors = FALSE)})
## 计算每天帖子矩阵列数
everyday.ncol <- c(length=366)
for(i in 1:366){
everyday.ncol[i] <- ncol(data[[i]])
}
## 选出每天一条帖子评论数评论数大于5000,51天符合要求
table(everyday.ncol-6 > 5000)
useful.day <- which(everyday.ncol-6 > 5000)
index2012 <- array(dim=c(length(useful.day), 2))
for(i in useful.day){
index2012[which(i == useful.day), ] <-
c(i, which(data[[i]][ , 5006] != "")[1])
} # 某些天不止一条帖子评论数超过5000,仅选取那天第一条
## 摘出51条帖子数据放入数据框
df2012 <- data.frame(matrix(NA, 51, 16384)) # EXCEL最多显示16384列,也只能读取这
for(i in index2012[ , 1]){
which.row <- which(index2012[ , 1] == i)
df2012[which.row, ] <- data[[i]][index2012[which.row, 2], ]
# 将循环填充的数据删去
true.ncol <- ncol(data[[i]][index2012[which.row, 2], ])
if(true.ncol < 16384){
df2012[which.row, (true.ncol+1):16384] <- NA
}
}
## 第50行时间格式与其他行不同,不做复杂转换处理,直接手动删去
df2012 <- df2012[-50, ]
## 时间格式数据转化为每小时评论数(前8小时每小时累计)
comment2012 <- matrix(nrow=nrow(df2012), ncol=8)
for(i in 1:nrow(df2012)){
initial.time <- strptime(df2012[i, 6], "%Y/%m/%d %H:%M")
count <- c(length=8)
for(j in 1:8){
number <- 0
reply.time <- initial.time + j*3600 #从首发帖时间j小时后
for (k in 7:ncol(df2012)){
if ((strptime(df2012[i, k], "%Y/%m/%d %H:%M") <=
reply.time) & !is.na(df2012[i, k])){
number <- number+1
}else{
break
}
}
count[j] <- number
}
comment2012[i, ] <- count
}
## 差分得出每小时评论数
for(i in 8:2){
comment2012[ , i] <- comment2012[ , i]-comment2012[ , i-1]
}
## Pearson拟合优度卡方检验是否为泊松分布
table(comment2012[ , 1])
X <- as.numeric(names(table(comment2012[ , 1])))
Y <- as.numeric(table(comment2012[ , 1]))
q <- ppois(X, mean(rep(X,Y)))
p <- numeric(length(Y))
p[1] <- q[1]; p[n] <- 1-q[n-1]
for (i in 2:(n-1)){
p[i]<-q[i]-q[i-1]
}
chisq.test(Y,p=p) # p小于0.05,拒绝原假设,不是泊松分布
# # 上述分组每组频数小于5,导致近似算法不准
## K-S检验是否为泊松分布
mean(comment2012[ , 8])
ks.test(jitter(comment2012[ , 1]), "ppois", 10)
# p小于0.05,拒绝原假设,不是泊松分布
# ks.test(jitter(comment2012[ , 8]/8), "ppois", 10)
ks.test(jitter(comment2012[ , 8]), "ppois", 86)
## 随机数产生参数为25的泊松分布
set.seed(100)
z1 <- rpois(50, 25)
hist(z1)
## 随机数产生参数为10的泊松分布
set.seed(100)
z2 <- rpois(50, 10)
hist(z2)
## 查看50帖子8小时评论数图形
# range(comment2012[ , 8])
## 画频数分布直方图
hist(comment2012[ , 8], freq = F, breaks = 20)
## 再画概率分布曲线
lines(density(comment2012[ , 8], bw=10), col="red", lwd=2)
## 查看50帖子1小时评论数图形
# range(comment2012[ , 1])
hist(comment2012[ , 1], freq = F, breaks = 30)
lines(density(comment2012[ , 1], bw=1), col="red", lwd=2)
## 查看50条帖子时间点分布
dim(df2012)
days <- strptime(df2012[ , 2006], "%Y/%m/%d %H:%M") -
strptime(df2012[ , 6], "%Y/%m/%d %H:%M")
days/24
sort(days)/24
## 以下用的comment2012是未差分过的!!!
## 第2条帖子
df2012[2, 6]
initial.time0109 <- strptime(df2012[2, 6], "%Y/%m/%d %H:%M")
count0109 <- c(length=76)
for(j in 1:76){
number <- 0
reply.time0109 <- initial.time0109 + j*3600 #从首发帖时间j小时后
for (k in 7:2006){
if ((strptime(df2012[2, k], "%Y/%m/%d %H:%M") <=
reply.time0109) & !is.na(df2012[2, k])){
number <- number+1
}else{
break
}
}
count0109[j] <- number
}
for(i in 76:2){
count0109[i] <- count0109[i] - count0109[i-1]
}
plot(count0109, type = "l")
## 第5条帖子
df2012[5, 6]
days[5]
initial.time0121 <- strptime(df2012[5, 6], "%Y/%m/%d %H:%M")
count0121 <- c(length=87)
for(j in 1:87){
number <- 0
reply.time0121 <- initial.time0121 + j*3600 #从首发帖时间j小时后
for (k in 7:2006){
if ((strptime(df2012[5, k], "%Y/%m/%d %H:%M") <=
reply.time0121) & !is.na(df2012[5, k])){
number <- number+1
}else{
break
}
}
count0121[j] <- number
}
for(i in 87:2){
count0121[i] <- count0121[i] - count0121[i-1]
}
plot(count0121, type = "l")
## 第39条帖子
df2012[39, 6]
days[39]
initial.time0601 <- strptime(df2012[39, 6], "%Y/%m/%d %H:%M")
count0601 <- c(length=42)
for(j in 1:42){
number <- 0
reply.time0601 <- initial.time0601 + j*3600 #从首发帖时间j小时后
for (k in 7:2006){
if ((strptime(df2012[39, k], "%Y/%m/%d %H:%M") <=
reply.time0601) & !is.na(df2012[39, k])){
number <- number+1
}else{
break
}
}
count0601[j] <- number
}
for(i in 42:2){
count0601[i] <- count0601[i] - count0601[i-1]
}
plot(count0601, type = "l")
## 拟合伽马分布
## 第2个帖子第一小时评论时间间隔gamma分布拟合
comment2012[5, 1]
df2012[5, 6]
post20120121 <- strptime(df2012[5, 6:29], "%Y/%m/%d %H:%M")
time.interval <- c(length=23)
for(i in 2:length(post20120121)){
time.interval[i-1] <- post20120121[i]-post20120121[i-1]
}
# hist(time.interval, freq = F, breaks = 23)
# lines(density(time.interval, bw=0.5), col="red", lwd=2)
## ks检验是否符合gamma分布
a20120121 <- mean(time.interval)^2/var(time.interval)
b20120121 <- mean(time.interval)/var(time.interval)
ks.test(jitter(time.interval), "pgamma", a20120121, b20120121)
## p值大于0.05,不能拒绝原假设,即符合gamma分布
## 第39个帖子第一小时评论时间间隔gamma分布拟合
comment2012[39, 1]
df2012[39, 6]
post20120601 <- strptime(df2012[39, 6:15], "%Y/%m/%d %H:%M")
time.interval2 <- c(length=9)
for(i in 2:length(post20120601)){
time.interval2[i-1] <- post20120601[i]-post20120601[i-1]
}
## ks检验是否符合gamma分布
a20120601 <- mean(time.interval2)^2/var(time.interval2)
b20120601 <- mean(time.interval2)/var(time.interval2)
ks.test(jitter(time.interval2), "pgamma", a20120601, b20120601)
## p值大于0.05,不能拒绝原假设,即符合gamma分布
## gamma分布画图
plot(sort(time.interval),
dgamma(sort(time.interval), a20120121, a20120121),
main="the Gamma Density Distribution",
col = "red", type="l", lwd=2)
lines(sort(time.interval2),
dgamma(sort(time.interval2), a20120601, a20120601),
col="blue",lwd=2)
x = seq(0,6,length.out=100)
y = dgamma(x,2,1)
plot(x, y, main="the Gamma Density Distribution",
xlim = c(0,6),ylim = c(0,1) ,col = "red" ,
type="l",lwd=2)