CPage
1 #ifndef _Page_H_030728_ 2 #define _Page_H_030728_ 3 4 #include <string> 5 #include <map> 6 #include <vector> 7 #include <list> 8 #include "Url.h" 9 #include "list.h" 10 #include "uri.h" 11 #include "hlink.h" 12 13 14 //large enough to hold sina's 437 links 15 16 const int ANCHOR_TEXT_LEN = 256; 17 const int MAX_URL_REFERENCES = 1000; 18 const int URL_REFERENCE_LEN = (URL_LEN+ANCHOR_TEXT_LEN)*MAX_URL_REFERENCES*1/2 ; 19 const int MAX_TAG_NUMBERS = 10000; 20 21 using namespace std; 22 23 // plain text or other 24 enum page_type { 25 PLAIN_TEXT, 26 OTHER 27 }; 28 29 struct RefLink4SE // <href src...>, <area src...> 30 { 31 char *link; 32 char *anchor_text; 33 string strCharset; 34 }; 35 36 struct RefLink4History // <img src...>,<script src...> 37 { 38 char *link; 39 }; 40 41 class CPage 42 { 43 public: 44 // url & location 45 string m_sUrl; //网页对应的URL字符串 46 47 // header 48 string m_sHeader;//网页头信息 49 int m_nLenHeader;//网页头信息的长度 50 51 int m_nStatusCode;//状态码 52 int m_nContentLength;;//从网页头信息中提取的网页体的长度,一般不是很准 53 string m_sLocation;//网页的转向信息,可以判断这个网页是否重定向 54 bool m_bConnectionState; //是否支持持续链接Keep-Alive为true否则为false 55 string m_sContentEncoding;//网页体的编码 56 string m_sContentType;//网页体的类型 57 string m_sCharset;//网页体的字符集 58 string m_sTransferEncoding;//网页体的传输编码方式 59 60 // content 61 string m_sContent;//网页体信息 62 int m_nLenContent;//网页体信息的长度 63 string m_sContentNoTags; 64 65 66 // link, in a lash-up state 67 string m_sContentLinkInfo; 68 //从网页体中提取出包含超链接信息的标识,例如<img src="www.baidu.com"/> , 69 //<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area> 70 71 // links for SE, in a lash-up state 72 string m_sLinkInfo4SE; 73 //再从m_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>标识信息 74 int m_nLenLinkInfo4SE;;//m_sLinkInfo4SE的长度 75 76 // links for history archiving, in a lash-up state 77 string m_sLinkInfo4History;//再从m_sContentLinkInfo提取出<img src="www.baidu.com">标识信息 78 int m_nLenLinkInfo4History;//m_sLinkInfo4History的长度 79 80 81 // links for SE, in a good state 82 RefLink4SE m_RefLink4SE[MAX_URL_REFERENCES];//保存URL信息<-->URL的描述信息[这里URL指的是为搜索准备的链接] 即每个网页最多能保存1000个链接 83 int m_nRefLink4SENum;//上面数组的长度 84 85 // links for history archiving, in a good state 86 RefLink4History m_RefLink4History[MAX_URL_REFERENCES/2];//保存URL信息[这个URL指的是为历史网页存档准备的链接] 87 int m_nRefLink4HistoryNum;//上面数组的长度 88 89 //map<string,string,less<string> > m_mapLink4SE; 90 map<string,string> m_mapLink4SE;//保存URL信息<-->URL的描述信息[这里URL指的是为搜索准备的链接] 91 //-----当然了这个map容器的作用主要是删除一个网页中相同的URL 92 vector<string > m_vecLink4History;//保存URL信息--当然了这个vector容器的作用主要是删除一个网页中相同的URL 93 94 // page type 95 enum page_type m_eType;//网页的类型 96 97 // parsed url lists 98 //list<string> m_listLink4SE; 99 100 public: 101 CPage(); 102 CPage(string strUrl, string strLocation, char* header, char* body, int nLenBody); 103 ~CPage(); 104 105 // parse header information from the header content 106 void ParseHeaderInfo(string header);//解析网页头信息 107 108 // parse hyperlinks from the page content 109 bool ParseHyperLinks();//从网页中提取出链接信息 110 111 bool NormalizeUrl(string& strUrl);//判断strUrl是不是正规的url 112 113 bool IsFilterLink(string plink);//判断plink链接是不是要过滤掉 114 115 private: 116 // parse header information from the header content 117 void GetStatusCode(string header);//得到状态码 118 void GetContentLength(string header);//从网页头信息中提取的网页体的长度,一般不是很准 119 void GetConnectionState(string header);//得到连接状态 120 void GetLocation(string header);//得到重定向信息 121 void GetCharset(string header);//得到字符集 122 void GetContentEncoding(string header);//得到网页体编码 123 void GetContentType(string header);//得到网页体类型 124 void GetTransferEncoding(string header);//得到网页体的传输编码方式 125 126 // parse hyperlinks from the web page 127 bool GetContentLinkInfo();//从网页体中提取出包含超链接信息的标识, 128 //例如<img src="www.baidu.com"/> ,<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area> 129 130 131 bool GetLinkInfo4SE();//再从m_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>标识信息 132 bool GetLinkInfo4History();//再从m_sContentLinkInfo提取出<img src="www.baidu.com">标识信息 133 bool FindRefLink4SE();//最终得到为搜索引擎准备的超链接 134 bool FindRefLink4History();//最终得到为历史网页存档准备的超链接 135 136 }; 137 138 #endif /* _Page_H_030728_ */
1 /*Page handling 2 */ 3 4 #include <iostream> 5 #include <string> 6 #include <cstring> 7 #include <map> 8 #include <vector> 9 #include <iterator> 10 #include "Url.h" 11 #include "Page.h" 12 #include "StrFun.h" 13 14 15 //带参构造函数 16 CPage::CPage() 17 { 18 //初始化成员变量 19 m_nStatusCode = 0; 20 m_nContentLength = 0; 21 m_sLocation = ""; 22 m_bConnectionState = false; 23 m_sContentEncoding = ""; 24 m_sContentType = ""; 25 m_sCharset = ""; 26 m_sTransferEncoding = ""; 27 28 m_sContentLinkInfo = ""; 29 m_sLinkInfo4SE = ""; 30 m_sLinkInfo4History = ""; 31 32 m_sContentNoTags = ""; 33 m_nRefLink4SENum = 0; 34 m_nRefLink4HistoryNum = 0; 35 m_eType = PLAIN_TEXT; 36 37 38 //超链接信息以及超链接的描述信息初始化都为空 39 for(int i=0; i< MAX_URL_REFERENCES; i++ ){ 40 m_RefLink4SE[i].link = NULL; 41 m_RefLink4SE[i].anchor_text = NULL; 42 m_RefLink4SE[i].strCharset = ""; 43 44 if(i < MAX_URL_REFERENCES/2){ 45 m_RefLink4History[i].link = NULL; 46 } 47 } 48 49 } 50 51 CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody) 52 { 53 //assert( header != NULL ); 54 //assert( body != NULL ); 55 //assert( nLenBody > 0 ); 56 57 // CPage(); 58 m_nStatusCode = 0; 59 m_nContentLength = 0; 60 m_sLocation = ""; 61 m_bConnectionState = false; 62 m_sContentEncoding = ""; 63 m_sContentType = ""; 64 m_sCharset = ""; 65 m_sTransferEncoding = ""; 66 67 m_sContentLinkInfo = ""; 68 m_sLinkInfo4SE = ""; 69 m_sLinkInfo4History = ""; 70 71 m_sContentNoTags = ""; 72 m_nRefLink4SENum = 0; 73 m_nRefLink4HistoryNum = 0; 74 m_eType = PLAIN_TEXT; 75 76 //超链接信息以及超链接的描述信息初始化都为空 77 for(int i=0; i< MAX_URL_REFERENCES; i++ ){ 78 m_RefLink4SE[i].link = NULL; 79 m_RefLink4SE[i].anchor_text = NULL; 80 m_RefLink4SE[i].strCharset = ""; 81 82 if(i < MAX_URL_REFERENCES/2){ 83 m_RefLink4History[i].link = NULL; 84 } 85 } 86 87 //将构造函数传入的参数赋值给成员变量 88 m_sUrl = strUrl;//网页对应的URL 89 m_sLocation = strLocation;//网页重定向的URL,没有重定向则传入为空,否则传入重定向的URL信息 90 m_sHeader = header;//网页的头信息 91 m_nLenHeader = strlen(header);//网页头信息的长度 92 93 m_sContent.assign(body, nLenBody);//网页体信息,用body所指向数组的前nLenBody个字符副本替换m_sContent 94 m_nLenContent = nLenBody;//网页体信息的长度 95 96 } 97 98 CPage::~CPage() 99 { 100 } 101 102 103 //解析网页头信息---调用8个私有的成员函数 104 void CPage::ParseHeaderInfo(string strHeader) 105 { 106 GetStatusCode(strHeader); 107 GetContentLength(strHeader); 108 GetLocation(strHeader); 109 GetConnectionState(strHeader); 110 111 GetCharset(strHeader); 112 113 GetContentEncoding(strHeader); 114 GetContentType(strHeader); 115 GetTransferEncoding(strHeader); 116 } 117 118 //得到状态码 119 void CPage::GetStatusCode(string headerBuf) 120 { 121 //例如: 122 123 //HTTP/1.0 200 OK 200就是状态码 124 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 125 126 char *charIndex = strstr(headerBuf.c_str(), "http/");//在字符串headerBuf中查找第一出现"http/"的位置 127 if (charIndex == NULL) 128 { 129 m_nStatusCode = -1; 130 return; 131 } 132 //吃掉所有无关的字符 133 while(*charIndex != ' '){ 134 charIndex++; 135 } 136 charIndex++; 137 138 int ret = sscanf(charIndex, "%i", &m_nStatusCode);//格式化字符串输入 139 if (ret != 1) m_nStatusCode = -1; 140 } 141 142 143 144 //从网页头信息中提取的网页体的长度,一般不是很准 145 void CPage::GetContentLength(string headerBuf) 146 { 147 //例如: 148 149 //content-length: 21237 21237就是网页体的长度,这个属性值是服务器返回的,不一定正确 150 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 151 152 char *charIndex = strstr(headerBuf.c_str(), "content-length"); 153 if (charIndex == NULL) return; 154 155 while(*charIndex != ' '){ 156 charIndex++; 157 } 158 charIndex++; 159 160 int ret = sscanf(charIndex, "%i", &m_nContentLength); 161 if (ret != 1) m_nContentLength = -1; 162 } 163 164 165 //得到重定向信息 166 void CPage::GetLocation(string headerBuf) 167 { 168 string::size_type pre_idx,idx; 169 const string delims("\r\n"); 170 171 string strBuf = headerBuf; 172 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 173 174 idx = headerBuf.find("location:"); 175 if (idx != string::npos)//若找到 176 { 177 pre_idx = idx + sizeof("location: ") -1; 178 idx = headerBuf.find_first_of(delims, pre_idx );//查找换行符 179 if (idx != string::npos) 180 { 181 //m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx); 182 m_sLocation = strBuf.substr(pre_idx, idx - pre_idx); 183 } 184 } 185 } 186 187 188 //得到网页字符集 189 void CPage::GetCharset(string headerBuf) 190 { 191 string::size_type pre_idx,idx; 192 const string delims(" \",;>"); 193 194 CStrFun::Str2Lower(headerBuf, headerBuf.size()); 195 196 idx = headerBuf.find("charset="); 197 if( idx != string::npos) { 198 m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);//保存从charset=开始的所有字符串 199 } 200 201 headerBuf = m_sContent; 202 headerBuf = headerBuf.substr(0,2024) ; 203 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 204 idx = headerBuf.find("charset="); 205 if (idx != string::npos)//后边有可能有多余的信息 206 { 207 pre_idx = idx + sizeof("charset=") -1; 208 idx = headerBuf.find_first_of(delims, pre_idx ); 209 if(idx != string::npos){ 210 m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx); 211 } 212 } 213 } 214 215 216 //得到网页体编码 217 void CPage::GetContentEncoding(string headerBuf) 218 { 219 string::size_type pre_idx,idx; 220 const string delims("\r\n"); 221 222 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 223 224 idx = headerBuf.find("content-encoding:"); 225 if (idx != string::npos) 226 { 227 pre_idx = idx + sizeof("content-encoding: ") -1; 228 idx = headerBuf.find_first_of(delims, pre_idx ); 229 if (idx != string::npos) 230 { 231 m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx); 232 } 233 } 234 } 235 236 //得到连接状态 237 void CPage::GetConnectionState(string headerBuf) 238 { 239 string::size_type pre_idx,idx; 240 const string delims(";\r\n"); 241 242 CStrFun::Str2Lower( headerBuf, headerBuf.length() ); 243 244 idx = headerBuf.find("connection:"); 245 if (idx != string::npos) 246 { 247 pre_idx = idx + sizeof("connection: ") -1; 248 idx = headerBuf.find_first_of(delims, pre_idx ); 249 if (idx != string::npos) 250 { 251 string str = headerBuf.substr(pre_idx, idx - pre_idx); 252 //cout << "Connection state: " << str << endl; 253 //if (str == "close") m_bConnectionState = false; 254 if (str == "keep-alive") m_bConnectionState = true; 255 } 256 } 257 } 258 259 //得到网页体类型 260 void CPage::GetContentType(string headerBuf) 261 { 262 string::size_type pre_idx,idx; 263 const string delims(";\r\n"); 264 265 CStrFun::Str2Lower( headerBuf, headerBuf.size() ); 266 267 idx = headerBuf.find("content-type:"); 268 if (idx != string::npos) 269 { 270 pre_idx = idx + sizeof("content-type: ") -1; 271 idx = headerBuf.find_first_of(delims, pre_idx ); 272 if (idx != string::npos) 273 { 274 m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx); 275 } 276 } 277 } 278 279 //得到网页体的传输编码方式 280 void CPage::GetTransferEncoding(string headerBuf) 281 { 282 string::size_type pre_idx,idx; 283 const string delims(";\r\n"); 284 285 CStrFun::Str2Lower( headerBuf, headerBuf.size() ); 286 287 idx = headerBuf.find("transfer-encoding:"); 288 if ( idx != string::npos) 289 { 290 pre_idx = idx + sizeof("transfer-encoding: ") -1; 291 idx = headerBuf.find_first_of(delims, pre_idx ); 292 if(idx != string::npos) 293 { 294 m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx); 295 } 296 } 297 } 298 299 /* 300 * Filter spam links 301 * If it is, return ture; otherwise false 302 */ 303 //判断一个URL是不是应该过滤,要过滤返回true否则返回false 304 bool CPage::IsFilterLink(string plink) 305 { 306 if( plink.empty() ) return true; 307 if( plink.size() > URL_LEN ) return true; 308 309 string link = plink, tmp; 310 string::size_type idx = 0; 311 312 313 CStrFun::Str2Lower( link, link.length() );//link字符串中的字母全部变成小写 314 315 // find two times following symbols, return false 316 tmp = link; 317 idx = tmp.find("?");//URL中出现2个'?'字符要过滤 318 if( idx != string::npos ){ 319 tmp = tmp.substr(idx+1); 320 idx = tmp.find("?"); 321 if( idx != string::npos ) return true; 322 } 323 324 tmp = link;//先后出现'-'和'+'字符要过滤 325 idx = tmp.find("-"); 326 if( idx != string::npos ){ 327 tmp = tmp.substr(idx+1); 328 idx = tmp.find("+"); 329 if( idx != string::npos ) return true; 330 } 331 332 //出现2个'&'字符要过滤 333 tmp = link; 334 idx = tmp.find("&"); 335 if( idx != string::npos ){ 336 tmp = tmp.substr(idx+1); 337 idx = tmp.find("&"); 338 if( idx != string::npos ) return true; 339 } 340 341 //出现2个"//"字符要过滤 342 tmp = link; 343 idx = tmp.find("//"); 344 if( idx != string::npos ){ 345 tmp = tmp.substr(idx+1); 346 idx = tmp.find("//"); 347 if( idx != string::npos ) return true; 348 } 349 350 //出现2个"http"要过滤 351 tmp = link; 352 idx = tmp.find("http"); 353 if( idx != string::npos ){ 354 tmp = tmp.substr(idx+1); 355 idx = tmp.find("http"); 356 if( idx != string::npos ) return true; 357 } 358 359 //出现2个"misc"要过滤 360 tmp = link; 361 idx = tmp.find("misc"); 362 if( idx != string::npos ){ 363 tmp = tmp.substr(idx+1); 364 idx = tmp.find("misc"); 365 if( idx != string::npos ) return true; 366 } 367 368 //出现2个"ipb"要过滤 369 tmp = link; 370 idx = tmp.find("ipb"); 371 if( idx != string::npos ){ 372 tmp = tmp.substr(idx+1); 373 idx = tmp.find("ipb"); 374 if( idx != string::npos ) return true; 375 } 376 377 const char *filter_str[]={ 378 "cgi-bin", "htbin", "linder", "srs5", "uin-cgi", // robots.txt of http://www.expasy.org/ 379 "uhtbin", "snapshot", "=+", "=-", "script", 380 "gate", "search", "clickfile", "data/scop", "names", 381 "staff/", "enter", "user", "mail", "pst?", 382 "find?", "ccc?", "fwd?", "tcon?", "&", 383 "counter?", "forum", "cgisirsi", "{", "}", 384 "proxy", "login", "00.pl?", "sciserv.pl", "sign.asp", 385 "<", ">", "review.asp?", "result.asp?", "keyword", 386 "\"", "'", "php?s=", "error", "showdate", 387 "niceprot.pl?", "volue.asp?id", ".css", ".asp?month", "prot.pl?", 388 "msg.asp", "register.asp", "database", "reg.asp", "qry?u", 389 "p?msg", "tj_all.asp?page", ".plot.", "comment.php", "nicezyme.pl?", 390 "entr", "compute-map?", "view-pdb?", "list.cgi?", "lists.cgi?", 391 "details.pl?", "aligner?", "raw.pl?", "interface.pl?","memcp.php?", 392 "member.php?", "post.php?", "thread.php", "bbs/", "/bbs" 393 }; 394 int filter_str_num = 75; 395 396 //说明找到了上述字符串要过滤 397 for(int i=0; i<filter_str_num; i++){ 398 if( link.find(filter_str[i]) != string::npos) 399 return true; 400 } 401 402 return false; 403 } 404 405 ///////////////////////////// 406 // just for ImgSE 407 // e.g: http://www.people.com.cn/GB/tupian/index.html 408 // http://news.xinhuanet.com/photo/ 409 // http://photo.tom.com/ 410 ///////////////////////////// 411 // comment previous one and open this one 412 413 /* 414 bool CPage::IsFilterLink(string plink) 415 { 416 if( plink.empty() ) return true; 417 if( plink.size() > URL_LEN ) return true; 418 419 return false; 420 421 string link = plink, tmp; 422 string::size_type idx = 0; 423 424 425 CStrFun::Str2Lower( link, link.length() ); 426 427 const char *filter_str[]={ 428 "tupian", "photo", "ttjstk" 429 }; 430 int filter_str_num = 3; 431 432 CStrFun::Str2Lower( link, link.length() ); 433 434 for(int i=0; i<filter_str_num; i++){ 435 if( link.find(filter_str[i]) != string::npos) 436 return false; 437 } 438 439 return true; 440 } 441 */ 442 443 444 /***************************************************************** 445 ** Function name: ParseHyperLinks 446 ** Input argv: 447 ** -- 448 ** Output argv: 449 ** -- 450 ** Return: 451 true: success 452 false: fail 453 ** Function Description: Parse hyperlinks from the web page 454 ** Version: 1.0 455 ** Be careful: 456 *****************************************************************/ 457 bool CPage::ParseHyperLinks() 458 { 459 if( GetContentLinkInfo() == false ) return false; 460 461 if( m_sContentLinkInfo.empty() ) return false; 462 463 bool bFind4SE = false; 464 bool bFind4History = false; 465 if( GetLinkInfo4SE() ){ 466 if( FindRefLink4SE() ) bFind4SE = true; 467 } 468 469 if( GetLinkInfo4History() ){ 470 if( FindRefLink4History() ) bFind4History = true; 471 } 472 473 //如果没有从网页中提取出为搜索引擎或者为历史网页存档准备的超链接则返回false 474 if( !bFind4SE && !bFind4History ){ 475 return false; 476 } 477 478 //return GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE); 479 480 return true; 481 } 482 483 484 /***************************************************************** 485 ** Function name: GetContentLinkInfo 486 ** Input argv: 487 ** -- 488 ** Output argv: 489 ** -- 490 ** Return: 491 true: success 492 false: fail 493 ** Function Description: Parse hyperlinks from the web page 494 ** Version: 1.0 495 ** Be careful: 496 *****************************************************************/ 497 498 //从网页体中提取出包含超链接信息的标识 499 bool CPage::GetContentLinkInfo() 500 { 501 if( m_sContent.empty() ) return false; 502 503 m_sContentLinkInfo = m_sContent; 504 505 string& s = m_sContentLinkInfo; //引用调用 506 507 // transform all separation into one space character 508 //CStrFun::ReplaceStr(s, "\t", " "); 509 //CStrFun::ReplaceStr(s, "\r", " "); 510 //CStrFun::ReplaceStr(s, "\n", " "); 511 const string delims(" \t\r\n"); 512 string::size_type idx=0, pre_idx; 513 514 //找到所有的"\t\r\n"并将'\t'替换为' ' 如果是\t\t\r\n则删除一个\t 515 while( (idx = s.find_first_of(delims, idx)) != string::npos ) 516 { 517 pre_idx = idx; 518 s.replace(idx,1,1,' '); 519 idx++; 520 521 while( (idx = s.find_first_of(delims, idx)) != string::npos ) 522 { 523 if( idx-pre_idx == 1 ){ 524 s.erase(idx, 1); 525 } else { 526 break; 527 } 528 } 529 530 idx--; 531 } 532 533 // transform all "<br>" into one space character 534 //将s中<br>标记全部替换为空格 535 CStrFun::ReplaceStr(s, "<br>", " "); 536 537 if( s.size() < 20 ) return false; 538 539 // Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags. 540 string::size_type idxHref=0,idxArea=0,idxImg=0; 541 string dest; 542 543 do{ 544 if( s.empty() ) break; 545 546 idxHref = CStrFun::FindCase(s, "href"); 547 idxArea = CStrFun::FindCase(s, "<area"); 548 idxImg = CStrFun::FindCase(s, "<img"); 549 550 pre_idx = idxHref > idxArea? idxArea: idxHref; 551 pre_idx = idxImg > pre_idx? pre_idx: idxImg; 552 if( pre_idx == string::npos) break; 553 554 s = s.substr(pre_idx); 555 idx = s.find_first_of('<',1); 556 if( idx != string::npos ){ 557 dest = dest + s.substr(0,idx); 558 }else{ 559 break; 560 } 561 562 s = s.substr(idx); 563 idxHref=0; idxArea=0; idxImg=0; 564 }while(1); 565 566 s = dest; 567 568 569 /* erase all '\' character 570 * too avoid the following situations: 571 * document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>"); 572 */ 573 CStrFun::EraseStr(s, "\\"); 574 575 if( s.size() < 20 ) return false; 576 577 return true; 578 } 579 580 /***************************************************************** 581 ** Function name: GetLinkInfo4SE() 582 ** Input argv: 583 ** -- 584 ** Output argv: 585 ** -- 586 ** Return: 587 true: success 588 false: fail 589 ** Function Description: Get links for SE 590 ** Version: 1.0 591 ** Be careful: 592 *****************************************************************/ 593 594 //再从m_sContentLinkInfo提取出为搜索引擎准备的超链接 595 bool CPage::GetLinkInfo4SE() 596 { 597 598 if( m_sContentLinkInfo.empty() ) return false; 599 600 m_sLinkInfo4SE = m_sContentLinkInfo; 601 string& s = m_sLinkInfo4SE; 602 603 // Keep only <area ...>,and <a href ...> tags. 604 string::size_type idxHref=0,idxArea=0, 605 idx,pre_idx; 606 string dest; 607 608 609 610 611 612 613 614 615 /* 616 617 例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk"> 618 619 我们这里提取出href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ 过滤掉<img src="http://www.google.com.hk"> 620 621 因为<img src="http://www.google.com.hk">的超链接是为历史网页存档准备的超链接 622 623 */ 624 625 626 do{ 627 if( s.empty() ) break; 628 629 //idxHref = CStrFun::FindCase(s, "<a href"); 630 idxHref = CStrFun::FindCase(s, "href"); 631 idxArea = CStrFun::FindCase(s, "<area "); 632 633 pre_idx = idxHref > idxArea? idxArea: idxHref; 634 //pre_idx = idxHref; 635 if( pre_idx == string::npos) break;//终止条件 636 637 s = s.substr(pre_idx); 638 idx = s.find_first_of('<',1); 639 640 if( !(s.length() < 4) ) 641 { 642 idxHref = CStrFun::FindCaseFrom(s, "href", 4); 643 idx = idx > idxHref ? idxHref: idx; 644 } 645 646 if( idx != string::npos ){ 647 dest = dest + s.substr(0,idx); 648 }else if (idx == string::npos && pre_idx != string::npos){ 649 dest = dest + s; 650 break; 651 }else{ 652 break; 653 } 654 655 s = s.substr(idx); 656 idxHref=0; idxArea=0; 657 }while(1); 658 659 s = dest;//dest保存着过滤后的数据 660 if( s.length() < 20 ) return false; 661 662 663 // erase all '"' , '\'', " ". 664 CStrFun::EraseStr(s, "\""); 665 CStrFun::EraseStr(s, "'"); 666 CStrFun::EraseStr(s, " "); 667 668 // Keep URLs and anchor text. 669 670 idxHref=0; 671 const string delims( " #>"); 672 dest.clear(); 673 674 675 676 /* 677 678 通过上面的提取我们得到href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ 679 680 我们再次提取 681 682 m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ 683 684 */ 685 686 687 688 do{ 689 if( s.empty() ) break; 690 idxHref = CStrFun::FindCase(s, "href"); 691 692 if( idxHref == string::npos) break; 693 pre_idx = idxHref; 694 695 //#### 696 idx = s.find('=', idxHref); 697 if( idx == string::npos ) break; 698 s = s.substr(idx+1); 699 700 while( s.length() > 0 && s[0] == ' ' ){ 701 s.erase(0,1); 702 } 703 if( s.length() == 0 ) break; 704 705 idx = s.find_first_of(delims,1); 706 //cout << endl << s.substr(0, idx) << endl; 707 if( idx == string::npos ) break; 708 709 dest += '"' + s.substr(0, idx); 710 711 //cout << endl << dest << endl; 712 713 idx = s.find('>'); 714 if( idx == string::npos ) break; 715 dest += '>'; 716 s = s.substr(idx +1); 717 718 idx = s.find('<'); 719 720 if( !s.empty() ){ 721 idxHref = CStrFun::FindCase(s, "href"); 722 idx = idx > idxHref ? idxHref: idx; 723 } 724 725 if( idx == string::npos ){ 726 dest += s; 727 break; 728 } 729 730 /* 731 if( idx == idxHref ){ 732 dest += '"' + s.substr(0,idx); 733 }else{ 734 */ 735 dest += s.substr(0,idx); 736 //} 737 //#### 738 739 idxHref=0; 740 }while(1); 741 742 // look for empty filenames. 743 idx = 0; 744 while( (idx = dest.find("\"\"",idx)) != string::npos ){ 745 dest.erase(idx, 1); 746 } 747 748 s = dest; 749 750 return( s.length() < 20 ? false: true ); 751 752 } 753 754 /***************************************************************** 755 ** Function name: GetLinkInfo4History() 756 ** Input argv: 757 ** -- 758 ** Output argv: 759 ** -- 760 ** Return: 761 true: success 762 false: fail 763 ** Function Description: Get links for history archiving 764 ** Version: 1.0 765 ** Be careful: 766 *****************************************************************/ 767 bool CPage::GetLinkInfo4History() 768 { 769 /* 770 771 例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk"> 772 773 我们这里提取出<img src="http://www.google.com.hk"> 过滤掉href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ 774 775 因为href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ的超链接是为搜索引擎准备的超链接 776 777 */ 778 779 780 781 782 783 784 if( m_sContentLinkInfo.empty() ) return false; 785 786 m_sLinkInfo4History = m_sContentLinkInfo; 787 string& s = this->m_sLinkInfo4History; 788 789 // Keep only <img ...> tags. 790 string::size_type idxImg=0, 791 idx,pre_idx; 792 string dest; 793 794 do{ 795 if( s.empty() ) break; 796 idxImg = CStrFun::FindCase(s, "<img"); 797 798 pre_idx = idxImg; 799 if( pre_idx == string::npos) break; 800 801 s = s.substr(pre_idx); 802 idx = s.find_first_of('<',1); 803 804 if( idx != string::npos ){ 805 dest = dest + s.substr(0,idx); 806 }else if (idx == string::npos && pre_idx != string::npos){ 807 dest = dest + s; 808 break; 809 }else{ 810 break; 811 } 812 813 s = s.substr(idx); 814 idxImg=0; 815 }while(1); 816 817 s = dest; 818 if( s.length() < 20 ) return false; 819 820 // erase all '"'. '\''," ". 821 CStrFun::EraseStr(s , "\""); 822 CStrFun::EraseStr(s , "'"); 823 CStrFun::EraseStr(s , " "); 824 825 // Keep URLs and anchor text. 826 827 idxImg=0; 828 string::size_type idxSrc = 0; 829 const string delims( " #>"); 830 dest.clear(); 831 832 833 /* 834 835 通过上面的提取我们得到<img src="http://www.google.com.hk"> 836 837 我们再次提取 838 839 m_sLinkInfo4History="http://www.google.com.hk> 840 841 */ 842 843 do{ 844 if( s.empty() ) break; 845 idxImg = CStrFun::FindCase(s, "img"); 846 847 if( idxImg == string::npos) break; 848 pre_idx = idxImg; 849 850 s = s.substr(idxImg+3); // skip "img" 851 852 //#### 853 idx = s.find('>', idxImg); 854 if( idxImg == string::npos) break; 855 if( s.empty() ) break; 856 idxSrc = CStrFun::FindCase(s, "src"); 857 if( idxSrc > idxImg ) continue; 858 s = s.substr(idxSrc); 859 860 idx = s.find('=', idxImg); 861 if( idx == string::npos ) break; 862 s = s.substr(idx+1); 863 864 while( s.length() > 0 && s[0] == ' ' ){ 865 s.erase(0,1); 866 } 867 if( s.length() == 0 ) break; 868 869 idx = s.find_first_of(delims,1); 870 if( idx == string::npos ) break; 871 872 if( s.at(0) == '"'){ 873 dest += s.substr(0, idx); 874 }else{ 875 dest += '"' + s.substr(0, idx); 876 } 877 878 idx = s.find('>'); 879 if( idx == string::npos ) break; 880 dest += '>'; 881 s = s.substr(idx +1); 882 883 idx = s.find('<'); 884 if( idx == string::npos ){ 885 dest += s; 886 break; 887 } 888 dest += s.substr(0,idx); 889 //#### 890 891 idxImg=0; 892 }while(1); 893 894 895 // look for empty filenames. 896 idx = 0; 897 while( (idx = dest.find("\"\"",idx)) != string::npos ){ 898 dest.erase(idx, 1); 899 } 900 901 s = dest; 902 903 return( s.length() < 20 ? false: true ); 904 905 } 906 907 908 909 910 //判断strUrl是不是正规的url 911 bool CPage::NormalizeUrl(string& strUrl) 912 { 913 string::size_type idx; 914 915 916 //URL没有htp://协议名我们这里认为strUrl不是正规的URL 917 if( CStrFun::FindCase(strUrl, "http://") == string::npos ) return false; 918 919 // convert "http://e.pku.cn" to "http://e.pku.cn/" 920 //将http://www.baidu.com转化为http://www.baidu.com/ 921 idx = strUrl.rfind('/'); 922 if( idx < 8 ) { 923 strUrl = strUrl + "/"; 924 return true; 925 } 926 927 //将"/./"-->"/" 928 while( (idx=strUrl.find("/./")) != string::npos ){ 929 if( idx != string::npos ) strUrl.erase(idx,2); 930 } 931 932 //将"xxx/x/../yyy"-->xxx/yyy 933 while( (idx = strUrl.find("/../")) != string::npos ){ 934 string strPre,strSuf; 935 936 strPre = strUrl.substr(0, idx); 937 938 if( strUrl.length() > idx+4 ) 939 strSuf = strUrl.substr(idx+4); 940 941 idx = strPre.rfind("/"); 942 if( idx != string::npos) 943 strPre = strPre.substr(0,idx+1); 944 if( strPre.length() < 10 ) return false; 945 946 strUrl = strPre + strSuf; 947 } 948 949 if( CStrFun::FindCase(strUrl, "http://") != 0 ) return false; 950 951 return true; 952 } 953 954 955 956 957 958 959 960 /*最终得到为搜索引擎准备的超链接 961 962 并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候 963 964 我们必须去重,这个函数用map容器很好的做到了这一点 965 966 还有一些URL不是正规的URL也要过滤 967 968 还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现 969 970 */ 971 bool CPage::FindRefLink4SE() 972 { 973 if( m_sLinkInfo4SE.empty() ) return false; 974 975 char *buffer = (char*)m_sLinkInfo4SE.c_str(); 976 int urlnum=0,len; 977 char *ptr ; 978 979 static char buf[URL_REFERENCE_LEN]; 980 981 memset(buf, 0, URL_REFERENCE_LEN); 982 len = strlen(buffer); 983 if( len < 8 ) return false; 984 985 len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1;//len记录相对较小的值 986 strncpy( buf, buffer, len); 987 988 /*first 989 *------> 990 */ 991 992 993 /* 994 995 例如:m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ 996 997 我们这里提取为 998 999 http://www.baidu.com 百度 1000 1001 http://www.qqq.com QQ 1002 1003 */ 1004 ptr = buf; 1005 while( ptr - buf < len && *ptr ) 1006 { 1007 while( *ptr == '"' && *ptr) ptr++; 1008 if ( !*ptr ) break; 1009 this->m_RefLink4SE[ urlnum].link = ptr;//每个网页里最多有1000个链接 1010 while( *ptr && *ptr != '>') 1011 { 1012 //在遇到'>'之前,出现了' '字符,我们必须将' '字符赋值为'\0'说明URL提取完了,因为URL不可能出现' '字符 1013 if(*ptr == ' ') *ptr = '\0'; 1014 //例如: "http://www.baidu.com/" height=100 width=150>百度 出现空格说明还有其他的属性值 1015 ptr++; 1016 } 1017 1018 if ( !*ptr ){ 1019 urlnum++; 1020 break; 1021 } 1022 if ( *ptr == '>' ) 1023 { 1024 *ptr++='\0'; 1025 if( !*ptr ) 1026 { 1027 urlnum++; 1028 break; 1029 } 1030 1031 if( *ptr == '"' ) 1032 { 1033 this->m_RefLink4SE[urlnum].anchor_text = NULL; 1034 } 1035 else 1036 { 1037 this->m_RefLink4SE[urlnum].anchor_text = ptr; 1038 while( *ptr && *ptr != '"') ptr++; 1039 if (!*ptr) 1040 { 1041 urlnum++; 1042 break; 1043 } 1044 if ( *ptr == '"') *ptr='\0'; 1045 } 1046 1047 } 1048 1049 //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t'; 1050 //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; 1051 1052 ptr++; 1053 urlnum++; 1054 if ( urlnum == MAX_URL_REFERENCES) break; //达到最多的url数目 1055 } 1056 //cout << endl << this->m_RefLink4SE[ urlnum].link << endl; 1057 //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl; 1058 1059 this->m_nRefLink4SENum = urlnum; 1060 1061 /*second 1062 *------> 1063 */ 1064 //typedef map<string,string,less<string> >::value_type valType; 1065 typedef map<string,string>::value_type valType; 1066 1067 m_mapLink4SE.clear(); 1068 1069 //string strRootUrl= m_sUrl; 1070 CUrl iUrl; 1071 if( iUrl.ParseUrlEx(m_sUrl) == false ) 1072 { 1073 cout << "ParseUrlEx error in FindRefLink4SE(): " << m_sUrl << endl; 1074 return false; 1075 } 1076 1077 for(int i=0; i<m_nRefLink4SENum; i++) 1078 { 1079 1080 string str; 1081 string::size_type idx; 1082 const string delims(" #"); 1083 1084 str = m_RefLink4SE[i].link; 1085 idx = str.find_first_of(delims, 0 ); 1086 if( idx != string::npos )//如果找到标志 1087 { 1088 str = str.substr(0, idx);//只取#前边的url 1089 } 1090 if( str.size() == 0 || str.size() > URL_LEN - 1 || str.size() < 4 ) 1091 continue; 1092 1093 1094 string::size_type idx1; 1095 idx1 = CStrFun::FindCase(str, "http"); 1096 if( idx1 != 0 )//str有可能是相对路径 1097 { 1098 char c1 = m_sUrl.at(m_sUrl.length()-1); 1099 char c2 = str.at(0); 1100 1101 if( c2=='/' )//str一定是相对路径 1102 { 1103 if( iUrl.m_nPort != 80 )//若是http 1104 { 1105 cout << iUrl.m_sHost << endl; 1106 cout << str << endl; 1107 //str = "http://" + iUrl.m_sHost + ":" + (const char*)(iUrl.m_nPort) + str; 1108 str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str; 1109 } 1110 else 1111 { 1112 str = "http://" + iUrl.m_sHost + str; 1113 } 1114 } 1115 else if( c1!='/' && c2!='/')//若两个都不是,则加上/构成新的url 1116 { 1117 string::size_type idx; 1118 1119 idx = m_sUrl.rfind('/'); 1120 if( idx != string::npos )//若不是最后 1121 { 1122 if( idx > 6 ) 1123 { // > strlen("http://..") 1124 str = m_sUrl.substr(0, idx+1) + str; 1125 } 1126 else 1127 { 1128 str = m_sUrl + "/" + str; 1129 } 1130 1131 } else { 1132 1133 continue; 1134 } 1135 1136 } 1137 else 1138 { 1139 if( c1=='/' ) 1140 { 1141 str = m_sUrl + str; 1142 } 1143 else 1144 { 1145 str = m_sUrl + "/" + str; 1146 } 1147 } 1148 } 1149 1150 if( NormalizeUrl(str) == false ) continue; 1151 1152 if( IsFilterLink(str) ) continue; 1153 1154 //debug 1155 //cout << "reflink: " << reflink << endl; 1156 1157 if( str == m_sUrl )//一个网页中提取的超链接是其本身,我就不要了,因为我们已经有了这个网页的URL了 1158 { 1159 continue; 1160 } 1161 else 1162 { 1163 if( m_RefLink4SE[i].anchor_text )//有URL的描述符 1164 { 1165 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() ) 1166 { 1167 m_mapLink4SE.insert( valType( str, m_RefLink4SE[i].anchor_text)); 1168 } 1169 } 1170 else//没有URL的描述符---这个时候描述符为'\0' 1171 { 1172 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() ) 1173 { 1174 m_mapLink4SE.insert( valType( str, "\0") ); 1175 cout << "."; 1176 } 1177 } 1178 } 1179 1180 1181 } 1182 1183 m_nRefLink4SENum = m_mapLink4SE.size(); 1184 1185 //cout << endl; 1186 1187 return true; 1188 } 1189 1190 1191 1192 1193 1194 //最终得到为历史网页存档准备的超链接 1195 1196 //并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候 1197 1198 //我们必须去重,这个函数用vector容器很好的做到了这一点 1199 1200 //还有一些URL不是正规的URL也要过滤 1201 1202 //还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现 1203 bool CPage::FindRefLink4History() 1204 { 1205 if( m_sLinkInfo4History.empty() ) return false; 1206 1207 char *buffer = (char*)m_sLinkInfo4History.c_str(); 1208 int urlnum=0,len; 1209 char *ptr ; 1210 1211 static char buf[URL_REFERENCE_LEN/2]; 1212 1213 memset(buf, 0, URL_REFERENCE_LEN/2); 1214 len = strlen(buffer); 1215 if( len < 8 ) return false; 1216 1217 len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1; 1218 strncpy( buf, buffer, len); 1219 1220 /*first 1221 *------> 1222 */ 1223 ptr = buf; 1224 while( ptr - buf < len && *ptr ){ 1225 while( *ptr == '"' && *ptr) ptr++; 1226 if ( !*ptr ) break; 1227 this->m_RefLink4History[ urlnum].link = ptr; 1228 1229 while( *ptr && *ptr != '>'){ 1230 if( *ptr == ' ') *ptr='\0'; 1231 ptr++; 1232 } 1233 1234 if( !*ptr){ 1235 urlnum++; 1236 break; 1237 } 1238 if( *ptr == '>' ){ 1239 *ptr++ = 0; 1240 if( !*ptr ){ 1241 urlnum++; 1242 break; 1243 } 1244 if( *ptr == '"' ){ 1245 1246 }else{ 1247 while( *ptr && *ptr != '"') ptr++; 1248 if( !*ptr ){ 1249 urlnum++; 1250 break; 1251 } 1252 if ( *ptr == '"' ) *ptr++='\0'; 1253 } 1254 } 1255 1256 ptr++; 1257 urlnum++; 1258 if ( urlnum == MAX_URL_REFERENCES/2) break; 1259 } 1260 1261 1262 this->m_nRefLink4HistoryNum = urlnum; 1263 1264 /*second 1265 *------> 1266 */ 1267 m_vecLink4History.clear(); 1268 //string strRootUrl= m_sUrl; 1269 CUrl iUrl; 1270 if( iUrl.ParseUrlEx(m_sUrl) == false ){ 1271 cout << "ParseUrlEx error in FindRefLink4History(): " << m_sUrl << endl; 1272 return false; 1273 } 1274 1275 for(int i=0; i<m_nRefLink4HistoryNum; i++){ 1276 string str; 1277 //string::size_type idx; 1278 1279 str = m_RefLink4History[i].link; 1280 if( str.size()==0 || str.size() > URL_LEN - 1 1281 || str.size() < 4 ) continue; 1282 1283 /* 1284 char *pdest1, *pdest2; 1285 pdest1 = strstr( str.c_str(), "http" ); 1286 pdest2 = strstr( str.c_str(), "HTTP" ); 1287 if( pdest1==NULL && pdest2==NULL ){ 1288 */ 1289 1290 string::size_type idx1; 1291 idx1 = CStrFun::FindCase(str, "http"); 1292 if( idx1 != 0 ){ 1293 char c1 = m_sUrl.at(m_sUrl.length()-1); 1294 char c2 = str.at(0); 1295 1296 if( c2=='/' ){ 1297 if( iUrl.m_nPort != 80 ){ 1298 str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str; 1299 } else { 1300 str = "http://" + iUrl.m_sHost + str; 1301 } 1302 } else if( c1!='/' && c2!='/'){ 1303 string::size_type idx; 1304 1305 idx = m_sUrl.rfind('/'); 1306 if( idx != string::npos ){ 1307 if( idx > 6 ){ // > strlen("http://..") 1308 str = m_sUrl.substr(0, idx+1) + str; 1309 } else { 1310 str = m_sUrl + "/" + str; 1311 } 1312 1313 } else { 1314 1315 continue; 1316 } 1317 1318 } else { 1319 if( c1=='/' ){ 1320 str = m_sUrl + str; 1321 } else { 1322 str = m_sUrl + "/" + str; 1323 } 1324 } 1325 } 1326 1327 // due to bad link parser 1328 /* 1329 1330 idx = reflink.find(' '); 1331 if(idx != string::npos){ 1332 reflink = reflink.substr(0,idx); 1333 } 1334 idx = reflink.find('"'); 1335 if(idx != string::npos){ 1336 reflink = reflink.substr(0,idx); 1337 } 1338 */ 1339 //############# 1340 1341 if( NormalizeUrl(str) == false ) continue; 1342 1343 1344 if( IsFilterLink(str) ) continue; 1345 1346 1347 if( str == m_sUrl ){ 1348 continue; 1349 }else{ 1350 vector<string>::iterator it; 1351 it = find(m_vecLink4History.begin(), m_vecLink4History.end(),str); 1352 if( it == m_vecLink4History.end() ){ 1353 1354 m_vecLink4History.push_back( str); 1355 cout << "."; 1356 } 1357 } 1358 1359 1360 } 1361 m_nRefLink4HistoryNum = m_vecLink4History.size(); 1362 //cout << endl; 1363 1364 return true; 1365 }
posted on 2012-07-05 15:00 kakamilan 阅读(2457) 评论(0) 编辑 收藏 举报