1 #ifndef _HTTP_H_031105_
2 #define _HTTP_H_031105_
3
#include <pthread.h>

#include <map>
#include <string>
5
6 using namespace std;
7
/*
 * Small HTTP/1.0-style client used to download a single page per call.
 *
 * Fetch() is the only public operation; the private members are the helpers
 * it is built from (socket creation, non-blocking connect, header reading and
 * request-buffer growth).
 */
class CHttp
{
private:
    std::string m_strUrl;   // url
    int *m_sock;            // socket

public:
    CHttp();
    virtual ~CHttp();

    /*
     * Downloads the page behind strUrl.
     *
     *   strUrl      URL of the page to fetch
     *   fileBuf     out: page body (heap buffer, NUL terminated); pass NULL to
     *               request the page and discard the body
     *   fileHeadBuf out: copy of the response header (heap buffer)
     *   location    out: redirect target when the server answers 301/302
     *   nPSock      in/out: socket descriptor; -1 on entry means "open a new
     *               connection", on success it holds the descriptor that was
     *               used, on failure it is reset to -1
     *
     * Returns the number of body bytes read, or a negative code:
     *   -1 generic error, -2 host outside the allowed IP block,
     *   -3 invalid host, -4 image content type, -300 redirect.
     */
    int Fetch(std::string strUrl, char **fileBuf,
              char **fileHeadBuf, char **location, int *nPSock);

private:
    // The four helpers below are called (directly or indirectly) by Fetch().

    // Reads the response header from sock into headerPtr using select().
    int read_header(int sock, char *headerPtr);

    // Resolves host, creates a TCP socket and connects it to host:port.
    int CreateSocket(const char *host, int port);

    // Called by CreateSocket(): connects sockfd to sa, waiting at most sec
    // seconds for the connection to complete.
    int nonb_connect(int sockfd, struct sockaddr *sa, int sec);

    // Makes sure *buf has more than `more` bytes of free space, growing it by
    // more + 1 bytes when it does not.
    int checkBufSize(char **buf, int *bufsize, int more);
};
42
// Mutex shared with the CHttp implementation, where every lock/unlock call
// around heap allocation is currently commented out. The definition is not in
// this header.
extern pthread_mutex_t mutexMemory;
44
45 #endif /* _HTTP_H_031105_ */
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <strings.h>
5 #include <errno.h>
6 #include <netdb.h>
7 #include <unistd.h>
8 #include <netinet/in.h>
9 #include <sys/types.h>
10 #include <sys/socket.h>
11 #include <sys/time.h>
12 #include <fcntl.h>
13 #include <iostream>
14 #include "Http.h"
15
16 //#include "Tse.h"
17 #include "CommonDef.h"
18 #include "Url.h"
19 //#include "Page.h"
20 #include "StrFun.h"
21
22 char *userAgent = NULL;
23 int timeout = DEFAULT_TIMEOUT;//设置最长的等待时间30秒
24 int hideUserAgent = 0;
25
26 CHttp::CHttp()
27 {
28 }
29
30 CHttp::~CHttp()
31 {
32 }
33
34
35 /*
36 * Actually downloads the page, registering a hit (donation)
37 * If the fileBuf passed in is NULL, the url is downloaded and then
38 * freed; otherwise the necessary space is allocated for fileBuf.
39 * Returns size of download on success,
40 -1 on error is set,
41 -2 out of ip block,
42 -3 invalid host,
43 -4 MIME is imag/xxx
44 -300 on 301.
45 */
46
47
48
49 /*
50
51 function:
52
53 success: return bytesRead[网页体信息的真实的字节数]
54
55 fail: return -1 各种其他的错误
56
57 return -2 在IP阻塞范围内
58
59 return -3 无效的主机号
60
61 return -4 image/text类型
62
63 return -300 网页重定向
64
65 strUrl: 待抓取的网页对应的URL
66
67 fileBuf: 网页体信息
68
69 fileHead:网页头信息
70
71 location:网页如果重定向对应的URL
72
73 sock:套接子文件描述符
74
75 */
76 int CHttp::Fetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock )
77 {
78 char *tmp, *url, *requestBuf, *pageBuf;
79 const char *host, *path;
80 int sock, bytesRead = 0, bufsize = REQUEST_BUF_SIZE;
81 int ret = -1, tempSize, selectRet;
82 int port = 80;
83
84
85 if( strUrl.empty() )//空的URL肯定不能抓取到网页
86 {
87 cout << "strUrl is NULL" << endl;
88 return -1;
89 }
90
91 /* Copy the url passed in into a buffer we can work with, change, etc. */
92 /*
93 url = (char*)malloc(strUrl.length()+1);
94 if( url == NULL ){
95 cout << "can not allocate enought memory for url" << endl;
96 return -1;
97 } else {
98 memset(url, 0,strUrl.length()+1);
99 memcpy(url, strUrl.c_str(), strUrl.length() );
100 }
101 */
102 //pthread_mutex_lock(&mutexMemory);
103 url = strdup(strUrl.c_str());//复制url
104 //pthread_mutex_unlock(&mutexMemory);
105 if( url == NULL )//分配失败
106 {
107 cout << "!error: stdup() in Fetch()" << endl;
108 return -1;
109 }
110
111 // parse the url
112 CUrl u;
113 if( u.ParseUrlEx(url) == false )
114 {
115 //如果没有"http://"协议号,肯定会解析错误
116 cout << "ParseUrlEx error in Fetch(): " << strUrl << endl;
117 return -1;
118 }
119
120 host = u.m_sHost.c_str();
121 path = u.m_sPath.c_str();
122 if( u.m_nPort > 0 ) port = u.m_nPort;
123
124 /* Compose a request string */
125 //pthread_mutex_lock(&mutexMemory);
126
127 /*构造HTTP请求报文: 假设strUrl="http://www.baidu.com/ecjtu/nihao.html"*/
128 // GET /ecjtu/nihao.html HTTP/1.0\r\n
129 requestBuf = (char*)malloc(bufsize);
130 //pthread_mutex_unlock(&mutexMemory);
131 if(requestBuf == NULL)
132 {
133 if (url)
134 {
135 //pthread_mutex_lock(&mutexMemory);
136 free(url);
137 url=NULL;
138 //pthread_mutex_unlock(&mutexMemory);
139 }
140 cout << "can not allocate enought memory for requestBuf" << endl;
141 return -1;
142 }
143 requestBuf[0] = 0;
144
145 if( strlen(path) < 1 )//说明请求的是根目录下的网页
146 {
147 // GET / HTTP/1.0\r\n
148 /* The url has no '/' in it, assume the user is making a root-level
149 * request */
150 tempSize = strlen("GET /") + strlen(HTTP_VERSION) +2;
151 /*
152 if( tempSize > bufsize ){
153 free(url);
154 free(requestBuf);
155 cout << "tempSize larger than bufsize" << endl;
156 return -1;
157 }
158 */
159
160 if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET / %s\r\n", HTTP_VERSION) < 0 ){
161 /*int snprintf(char *restrict buf, size_t n, const char * restrict format, ...);
162 函数说明:最多从源串中拷贝n-1个字符到目标串中,然后再在后面加一个0。所以如果目标串的大小为n
163 的话,将不会溢出。*/
164
165 //pthread_mutex_lock(&mutexMemory);
166 if (url)
167 {
168 free(url);
169 url=NULL;
170 }
171 if (requestBuf)
172 {
173 free(requestBuf);
174 requestBuf=NULL;
175 }
176 //pthread_mutex_unlock(&mutexMemory);
177 cout << "1.checkBuffSize(&requestBuf..) error" << endl;
178 return -1;
179 }
180
181 }
182 else//说明请求的是非根目录下的网页
183 {
184 tempSize = strlen("GET ") + strlen(path) + strlen(HTTP_VERSION) + 4;
185
186 if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET %s %s\r\n", path, HTTP_VERSION) < 0)
187 {
188
189 //pthread_mutex_lock(&mutexMemory);
190 if (url)
191 {
192 free(url);
193 url=NULL;
194 }
195 if (requestBuf)
196 {
197 free(requestBuf);
198 requestBuf=NULL;
199 }
200 //pthread_mutex_unlock(&mutexMemory);
201 cout << "2._checkBuffSize(&requestBuf..) error" << endl;
202 return -1;
203 }
204
205 }
206
207
208 /* Use Host: even though 1.0 doesn't specify it. Some servers
209 * won't play nice if we don't send Host, and it shouldn't hurt anything */
210 tempSize = (int)strlen("Host: ") + (int)strlen(host) + 3;/* +3 for "\r\n\0" */
211
212 if(checkBufSize(&requestBuf, &bufsize, tempSize + 128)){
213 //pthread_mutex_lock(&mutexMemory);
214 if (url)
215 {
216 free(url); url=NULL;
217 }
218 if (requestBuf)
219 {
220 free(requestBuf); requestBuf=NULL;
221 }
222 //pthread_mutex_unlock(&mutexMemory);
223 cout << "3._checkBuffSize(&requestBuf..) error" << endl;
224 return -1;
225 }
226
227 strcat(requestBuf, "Host: ");
228 strcat(requestBuf, host);
229 strcat(requestBuf, "\r\n");
230
231 if(!hideUserAgent && userAgent == NULL) {
232
233 tempSize = (int)strlen("User-Agent: ") +
234 (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 4;
235 if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
236 //pthread_mutex_lock(&mutexMemory);
237 if (url)
238 {
239 free(url); url=NULL;
240 }
241 if (requestBuf)
242 {
243 free(requestBuf); requestBuf=NULL;
244 }
245 //pthread_mutex_unlock(&mutexMemory);
246 cout << "4._checkBuffSize(&requestBuf..) error" << endl;
247 return -1;
248 }
249 strcat(requestBuf, "User-Agent: ");
250 strcat(requestBuf, DEFAULT_USER_AGENT);
251 strcat(requestBuf, "/");
252 strcat(requestBuf, VERSION);
253 strcat(requestBuf, "\r\n");
254
255 } else if(!hideUserAgent) {
256
257 tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 3;
258 if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
259
260 //pthread_mutex_lock(&mutexMemory);
261 if (url)
262 {
263 free(url); url=NULL;
264 }
265 if (requestBuf)
266 {
267 free(requestBuf); requestBuf=NULL;
268 }
269 //pthread_mutex_unlock(&mutexMemory);
270 cout << "5._checkBuffSize(&requestBuf..) error" << endl;
271 return -1;
272 }
273 strcat(requestBuf, "User-Agent: ");
274 strcat(requestBuf, userAgent);
275 strcat(requestBuf, "\r\n");
276 }
277
278 //tempSize = (int)strlen("Connection: Close\n\n");
279 tempSize = (int)strlen("Connection: Keep-Alive\r\n\r\n");
280 if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
281 //pthread_mutex_lock(&mutexMemory);
282 if (url)
283 {
284 free(url); url=NULL;
285 }
286 if (requestBuf)
287 {
288 free(requestBuf); requestBuf=NULL;
289 }
290 //pthread_mutex_unlock(&mutexMemory);
291 cout << "6._checkBuffSize(&requestBuf..) error" << endl;
292 return -1;
293 }
294
295
296 //strcat(requestBuf, "Connection: Close\n\n");
297 strcat(requestBuf, "Connection: Keep-Alive\r\n\r\n");
298
299
300 /* Now free any excess memory allocated to the buffer */
301 //pthread_mutex_lock(&mutexMemory);
302 //重新调整requestBuf的内存空间,释放多余的内存空间
303 tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1);
304 //pthread_mutex_unlock(&mutexMemory);
305 if(tmp == NULL){
306 //pthread_mutex_lock(&mutexMemory);
307 if (url)
308 {
309 free(url); url=NULL;
310 }
311 if (requestBuf)
312 {
313 free(requestBuf); requestBuf=NULL;
314 }
315 //pthread_mutex_unlock(&mutexMemory);
316 cout << "realloc for tmp error" << endl;
317 return -1;
318 }
319 requestBuf = tmp;
320
321 if( *nPSock != -1 ){
322 sock = *nPSock;
323 cout << "using privous socket " << *nPSock << endl;
324 }else{
325
326 // cout << "1.get a new one" << endl;
327 sock = CreateSocket( host, port );
328 if(sock == -1) { // invalid host
329 //pthread_mutex_lock(&mutexMemory);
330 if (url)
331 {
332 free(url); url=NULL;
333 }
334 if (requestBuf)
335 {
336 free(requestBuf); requestBuf=NULL;
337 }
338 //pthread_mutex_unlock(&mutexMemory);
339 return -3;
340 }
341 if(sock == -2) { // out of ip block
342 //pthread_mutex_lock(&mutexMemory);
343 if (url)
344 {
345 free(url); url=NULL;
346 }
347 if (requestBuf)
348 {
349 free(requestBuf); requestBuf=NULL;
350 }
351 //pthread_mutex_unlock(&mutexMemory);
352 //cout << "2.not able to MakeSocket" << endl;
353 return -2;
354 }
355 }
356
357
358
359 ret = write(sock, requestBuf, strlen(requestBuf));
360 if( ret == 0 ){
361 cout << "requestBuf is " << requestBuf << endl;
362 cout << "write nothing" << endl;
363 //pthread_mutex_lock(&mutexMemory);
364 if (url)
365 {
366 free(url); url=NULL;
367 }
368 if (requestBuf)
369 {
370 free(requestBuf); requestBuf=NULL;
371 }
372 //pthread_mutex_unlock(&mutexMemory);
373 close(sock);
374 *nPSock = -1;
375 return -1;
376
377 }
378 if( ret == -1){
379 //cout << "write error" << endl;
380 // sock is invalid,we should make a new one
381 close(sock);
382 *nPSock = -1;
383
384 cout << "2.close previous socket " << *nPSock << " and get a new one" << endl;
385 //maybe sock is dead,try again
386 sock = CreateSocket( host, port );
387 if(sock == -1) {
388 //pthread_mutex_lock(&mutexMemory);
389 if (url)
390 {
391 free(url); url=NULL;
392 }
393 if (requestBuf)
394 {
395 free(requestBuf); requestBuf=NULL;
396 }
397 //pthread_mutex_unlock(&mutexMemory);
398 cout << "3.not able to MakeSocket" << endl;
399 return -1;
400 }
401 if(sock == -2) {
402 //pthread_mutex_lock(&mutexMemory);
403 if (url)
404 {
405 free(url); url=NULL;
406 }
407 if (requestBuf)
408 {
409 free(requestBuf); requestBuf=NULL;
410 }
411 //pthread_mutex_unlock(&mutexMemory);
412 cout << "4.not able to MakeSocket" << endl;
413 return -1;
414 }
415 if(write(sock, requestBuf, strlen(requestBuf)) == -1){
416 //pthread_mutex_lock(&mutexMemory);
417 if (url)
418 {
419 free(url); url=NULL;
420 }
421 if (requestBuf)
422 {
423 free(requestBuf); requestBuf=NULL;
424 }
425 //pthread_mutex_unlock(&mutexMemory);
426 close(sock);
427 *nPSock = -1;
428 cout << "write error" << endl;
429 return -1;
430 }
431 }
432
433 //pthread_mutex_lock(&mutexMemory);
434 if (url)
435 {
436 free(url); url=NULL;
437 }
438 if (requestBuf)
439 {
440 free(requestBuf); requestBuf=NULL;
441 }
442 //pthread_mutex_unlock(&mutexMemory);
443
444
445 char headerBuf[HEADER_BUF_SIZE];
446 /* Grab enough of the response to get the metadata */
447 memset( headerBuf,0,HEADER_BUF_SIZE );
448 //cout << "old sock is " << sock << endl;
449 ret = read_header(sock, headerBuf);
450 //cout << "ret = " << ret << endl;
451 if(ret < 0) {
452 close(sock);
453 *nPSock = -1;
454 return -1;
455 }
456
457 //cout << headerBuf << endl;
458 if( strlen(headerBuf) == 0 ){
459 cout << "strlen(headerBuf) = 0" << headerBuf << endl;
460 cout << "strUrl: " << strUrl << endl << endl;;
461 close(sock);
462 *nPSock = -1;
463 return -1;
464 }
465
466
467
468 //解析网页头信息
469 CPage iPage;
470 iPage.ParseHeaderInfo(headerBuf);
471 if (iPage.m_nStatusCode == -1)
472 {
473 close(sock);
474 *nPSock = -1;
475 cout << "headerBuf: " << headerBuf << endl;
476 cout << "!header error: not find HTTP" << endl;
477 return -1;
478 }
479
480
481
482 // deal with http://net.cs.pku.edu.cn/~cnds
483 if (iPage.m_nStatusCode == 301 || iPage.m_nStatusCode == 302)
484 {
485 if (iPage.m_sLocation.empty() || iPage.m_sLocation.size()>URL_LEN)
486 {
487 close(sock);
488 *nPSock = -1;
489 cout << headerBuf << endl;
490 cout << "!error: Location" << endl;
491 return -1;
492 }
493 else
494 {
495 //pthread_mutex_lock(&mutexMemory);
496 char *loc=strdup(iPage.m_sLocation.c_str());
497 //pthread_mutex_unlock(&mutexMemory);
498 *location = loc;
499 close(sock);
500 *nPSock = -1;
501 return -300;//重定向了
502 }
503 }
504
505 if(iPage.m_nStatusCode<200 || iPage.m_nStatusCode>299 ){
506 close(sock);
507 *nPSock = -1;
508 cout << "!header code = " << iPage.m_nStatusCode << endl;
509 return -1;
510 }
511
512 // when crawling images for ImgSE, remember to comment the paragraph
513 // when crawling plain text for SE, remember to open the paragraph
514 // paragraph begin
515 if( iPage.m_sContentType.find("image") != string::npos )
516 { //
517 close(sock);
518 *nPSock = -1;
519 return -4;
520 }
521 // paragraph end
522
523 if (iPage.m_nContentLength == -1)
524 {
525 close(sock);
526 *nPSock = -1;
527 cout << headerBuf << endl;
528 cout << "!error: Content-length" << endl;
529 return -1;
530 }
531
532 if (iPage.m_nContentLength==0 || iPage.m_nContentLength<20)
533 { // Allocate enough memory to hold the page
534 iPage.m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
535 }
536
537
538 if (iPage.m_nContentLength > MAX_PAGE_BUF_SIZE)
539 {
540 cout<<"这个网页的长度大于5M,我过滤掉它!"<<endl;
541 cout << "the page discarded due to its size "
542 << iPage.m_nContentLength
543 << " is larger than " << MAX_PAGE_BUF_SIZE << endl;
544 close(sock);
545 *nPSock = -1;
546 return -1;
547 }
548
549 //pthread_mutex_lock(&mutexMemory);
550 pageBuf = (char *)malloc(iPage.m_nContentLength);
551 //pthread_mutex_unlock(&mutexMemory);
552 if(pageBuf == NULL){
553 close(sock);
554 *nPSock = -1;
555 cout << "malloc for pageBuf" << endl;
556 return -1;
557 }
558
559 /* Begin reading the body of the file */
560 //开始读取网页体信息
561 fd_set rfds;
562 struct timeval tv;
563 int flags;
564 //将sock套接子文件描述符设置为非阻塞的方式
565 flags=fcntl(sock,F_GETFL,0);
566 if(flags<0)
567 {
568 close(sock);
569 *nPSock = -1;
570 if (pageBuf)
571 {
572 //pthread_mutex_lock(&mutexMemory);
573 free(pageBuf);
574 pageBuf=NULL;
575 //pthread_mutex_unlock(&mutexMemory);
576 }
577 cout << "1.fcntl() error " << endl;
578 return -1;
579 }
580
581
582 flags|=O_NONBLOCK;
583 if(fcntl(sock,F_SETFL,flags)<0){
584 close(sock);
585 *nPSock = -1;
586 if (pageBuf)
587 {
588 free(pageBuf); pageBuf=NULL;
589 }
590 cout << "2.fcntl() error " << endl;
591 return -1;
592 }
593
594
595 //挂一个while()循环读取网页体信息
596 int pre_ret=0;
597 while(ret > 0)
598 {
599 FD_ZERO(&rfds);//清理rfds读文件描述符集合
600 FD_SET(sock, &rfds);//将sock加到rfds读文件描述符集合中
601 if( bytesRead == iPage.m_nContentLength )
602 {
603 tv.tv_sec = 1;
604 }
605 else
606 {
607 tv.tv_sec = timeout;
608 }
609 tv.tv_usec = 0;
610
611 if(DEFAULT_TIMEOUT >= 0)
612 selectRet = select(sock+1, &rfds, NULL, NULL, &tv);//IO复用
613 else /* No timeout, can block indefinately */
614 selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
615
616 if(selectRet == 0 && timeout < 0)//超时
617 {
618 close(sock);
619 *nPSock = -1;
620 if (pageBuf)
621 {
622 //pthread_mutex_lock(&mutexMemory);
623 free(pageBuf);
624 pageBuf=NULL;
625 //pthread_mutex_unlock(&mutexMemory);
626 }
627 cout << "selectRet == 0 && timeout < 0" << endl;
628 return -1;
629 }
630 else if(selectRet == -1)//select()函数出错
631 {
632 close(sock);
633 *nPSock = -1;
634 if (pageBuf)
635 {
636 free(pageBuf);
637 pageBuf=NULL;
638 }
639 cout << "selectRet == -1" << endl;
640 return -1;
641 }
642
643 //每次最多接收iPage.m_nContentLength字节--缓冲区的大小为iPage.m_nContentLength
644 ret = read(sock, pageBuf + bytesRead, iPage.m_nContentLength);
645 //ret = read(sock, (char*)pageBuf.c_str() + bytesRead, iPage.m_nContentLength);
646
647 if(ret == 0) break;
648 if(ret == -1 && pre_ret==0)//read()函数出错
649 {
650 close(sock);
651 *nPSock = -1;
652 if (pageBuf)
653 {
654 //pthread_mutex_lock(&mutexMemory);
655 free(pageBuf); pageBuf=NULL;
656 //pthread_mutex_unlock(&mutexMemory);
657 }
658 cout << "read()'s retval=-1" << endl;
659 return -1;
660 }
661 else if( ret == -1 && pre_ret )
662 {
663 //cout << "2. pre_ret = " << pre_ret << endl;
664 /*
665 if( bytesRead < iPage.m_nContentLength){ // meaning we lost the connection too soon
666 cout << "lost the connection too soon" << endl;
667 freeOpageBuf);
668 return -1;
669 }
670 */
671 break;
672 }
673
674 pre_ret = ret;
675 //cout << "1.pre_ret = " << pre_ret << endl;
676
677 bytesRead += ret;
678
679
680 /* To be tolerant of inaccurate Content-Length fields, we'll
681 * allocate another read-sized chunk to make sure we have
682 * enough room.
683 */
684 if(ret > 0) {
685 //pthread_mutex_lock(&mutexMemory);
686 pageBuf = (char *)realloc(pageBuf, bytesRead + iPage.m_nContentLength);
687 //pthread_mutex_unlock(&mutexMemory);
688 if(pageBuf == NULL) {
689 close(sock);
690 *nPSock = -1;
691 if (pageBuf)
692 {
693 //pthread_mutex_lock(&mutexMemory);
694 free(pageBuf); pageBuf=NULL;
695 //pthread_mutex_unlock(&mutexMemory);
696 }
697 cout << "realloc()" << endl;
698 return -1;
699 }
700 }
701
702 }
703
704 /*
705 * The download buffer is too large. Trim off the safety padding.
706 */
707
708 //pthread_mutex_lock(&mutexMemory);
709 pageBuf = (char *)realloc(pageBuf, bytesRead+1);
710 //pthread_mutex_unlock(&mutexMemory);
711 if(pageBuf == NULL){
712 close(sock);
713 *nPSock = -1;
714 if (pageBuf)
715 {
716 //pthread_mutex_lock(&mutexMemory);
717 free(pageBuf); pageBuf=NULL;
718 //pthread_mutex_unlock(&mutexMemory);
719 }
720 cout << "2.realloc()" << endl;
721 return -1;
722 }
723
724
725 pageBuf[bytesRead] = '\0';
726
727
728 if(fileBuf == NULL){ /* They just wanted us to "hit" the url */
729 if (pageBuf)
730 {
731 //pthread_mutex_lock(&mutexMemory);
732 free(pageBuf); pageBuf=NULL;
733 //pthread_mutex_unlock(&mutexMemory);
734 }
735 }else{
736
737
738
739 char *tmp;
740 //tmp = (char *)malloc(HEADER_BUF_SIZE);
741 //pthread_mutex_lock(&mutexMemory);
742 tmp = (char *)malloc(strlen(headerBuf)+1);
743 //pthread_mutex_unlock(&mutexMemory);
744 if(tmp == NULL){
745 close(sock);
746 *nPSock = -1;
747 if (pageBuf)
748 {
749 //pthread_mutex_lock(&mutexMemory);
750 free(pageBuf); pageBuf=NULL;
751 //pthread_mutex_unlock(&mutexMemory);
752 }
753 cout << "malloc() for headerBuf" << endl;
754 return -1;
755 }
756 //memcpy( tmp, headerBuf, HEADER_BUF_SIZE-1 );
757 strncpy( tmp, headerBuf, strlen(headerBuf)+1 );
758 *fileHeadBuf = tmp;
759
760 *fileBuf = pageBuf;
761 }
762
763 //close(sock);
764 *nPSock = sock;
765 return bytesRead;
766 }
767
768
769
770
771
772 /*
773
774 function: 创建套接字文件描述符,并且调用nonb_connect()同目标服务器进行连接
775
776 success: return sock[成功创建的套接子文件描述符]
777
778 fail: return -1 其他错误
779
780 return -2 在IP阻塞范围内
781
782 */
783 int CHttp::CreateSocket(const char *host, int port)
784 {
785 int sock; // Socket descriptor
786 struct sockaddr_in sa; // Socket address
787
788
789 unsigned long inaddr;
790 int ret;
791
792 CUrl url;
793 char *ip = url.GetIpByHost(host);//通过主机号得到IP地址
794
795 if( ip == NULL )//获得失败
796 { // gethostbyname() error in GetIpByHost()
797 //cout << "invalid host: " << host << endl;
798 return -1;
799
800 }
801 else
802 {
803 // filter ip (decide whether it is inside the ip block)
804 if( url.IsValidIp(ip) )//在IP阻塞范围内
805 {
806 // inside
807 inaddr = (unsigned long)inet_addr(ip);//将字符串IP转化为32位的网络字节序
808
809 if( inaddr == INADDR_NONE )
810 {
811 // release the buffer, be careful
812 //pthread_mutex_lock(&mutexMemory);
813 delete [] ip; ip = NULL;
814 //pthread_mutex_unlock(&mutexMemory);
815 cout << "invalid ip " << ip << endl;
816 return -1;
817 }
818
819 memcpy((char *)&sa.sin_addr, (char *)&inaddr, sizeof(inaddr));
820
821 // release the buffer, be carful
822 //pthread_mutex_lock(&mutexMemory);
823 delete [] ip; ip = NULL;
824 //pthread_mutex_unlock(&mutexMemory);
825
826 }
827 else//在IP阻塞范围外
828 { // out of ip block
829 // release the buffer, be carful
830 //pthread_mutex_lock(&mutexMemory);
831 delete [] ip; ip = NULL;
832 //pthread_mutex_unlock(&mutexMemory);
833 //cout << "out of ip block: " << host << endl;
834 return -2;
835 }
836 }
837
838
839 /* Copy host address from hostent to (server) socket address */
840 sa.sin_family = AF_INET;
841 sa.sin_port = htons(port); /* Put portnum into sockaddr */
842
843 sock = -1;
844 sock = socket(AF_INET, SOCK_STREAM, 0);//创建套接字文件描述符
845 if(sock < 0 ) //创建失败
846 {
847 cout << "socket() in CreateSocket" << endl;
848 return -1;
849 }
850
851 int optval = 1;
852 if (setsockopt (sock, SOL_SOCKET, SO_REUSEADDR,(char *)&optval, sizeof (optval)) < 0)
853 //SOL_SOCKET 通用套接字选项
854 //SO_REUSEADDR 表示允许本地地址重用
855 {
856
857 cout << "setsockopt() in CreateSocket" << endl;
858 close(sock);
859 return -1;
860 }
861
862 //ret = connect(sock, (struct sockaddr *)&sa, sizeof(sa));
863 ret = nonb_connect(sock, (struct sockaddr *)&sa, DEFAULT_TIMEOUT);
864 if(ret == -1) {
865 cout << "nonb_connect() in CreateSocket" << endl;
866 close(sock);
867 return -1;
868 }
869
870 return sock;//返回套接字文件描述符
871 }
872
873
874 /* function:通过IO复用的方法读取网页头信息
875 success: return bytesRead[网页头信息的真实长度]
876 fail: return -1
877 */
878 int CHttp::read_header(int sock, char *headerPtr)
879 {
880 fd_set rfds;//读文件描述符集合
881 struct timeval tv;
882 int bytesRead = 0, newlines = 0, ret, selectRet;
883
884 int flags;
885
886 flags=fcntl(sock,F_GETFL,0);//将sock套接子文件描述符设置为非阻塞方式
887 if(flags<0)
888 {
889 cout << "1.fcntl() in read_header()< 0" << endl;
890 return -1;
891 }
892
893 flags|=O_NONBLOCK;
894 if(fcntl(sock,F_SETFL,flags)<0)
895 {
896 cout << "2.fcntl() < 0 in read_header()" << endl;
897 return -1;
898 }
899
900 //挂一个while()循环来读取网页头信息
901 while(newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
902 {
903 FD_ZERO(&rfds);//清理读文件描述符集合
904 FD_SET(sock, &rfds);//将套接字文件描述符加到读文件描述符集合中
905 tv.tv_sec = timeout;//设置最长的等待时间
906 tv.tv_usec = 0;
907
908 if(timeout >= 0)
909 selectRet = select(sock+1, &rfds, NULL, NULL, &tv);
910 else //最一个参数设置为NULL,表示阻塞操作会一直等待,直到莫个监视的文件集合中的某个文件描述符符合返回条件
911 selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
912
913 if(selectRet == 0 && timeout < 0)
914 {
915 cout << "selectRet == 0 && timeout < 0" << endl;
916 return -1;
917 }
918 else if(selectRet == -1) //select()出错
919 {
920 cout << "selectRet == 0 && timeout < 0 else" << endl;
921 return -1;
922 }
923
924 ret = read(sock, headerPtr, 1);
925 if(ret == -1)
926 {
927 cout << "!error: read() in read_header()" << endl;
928 return -1;
929 }
930
931 bytesRead++;
932
933 if(*headerPtr == '\r')
934 { /* Ignore CR */
935 /* Basically do nothing special, just don't set newlines
936 * to 0 */
937 headerPtr++;
938 continue;
939 }
940 else if(*headerPtr == '\n') /* LF is the separator */
941 newlines++;
942 else
943 newlines = 0;
944
945 headerPtr++;
946
947 }
948
949 //headerPtr -= 3; /* Snip the trailing LF's */
950 /* to be compatible with Tianwang format, we have to retain them*/
951 headerPtr -= 2;
952 *headerPtr = '\0';
953 //cout << "in it " << headerPtr << endl;
954 return bytesRead;
955 }
956
957
958
959 /*
960 function:被CreateSocket()调用,通过IO复用的方法连接目标服务器
961 success: return 0;
962 fail: return -1;
963 sockfd: 套接子文件描述符
964 sa: 服务器套接子地址结构
965 sec: 最长的等待时间
966 */
967 int CHttp::nonb_connect(int sockfd,struct sockaddr* sa,int sec)
968 {
969 int flags;
970 int status;
971 fd_set mask;//写文件描述符集合
972 struct timeval timeout;
973
974 //set the socket as nonblocking
975 flags=fcntl(sockfd,F_GETFL,0);//将套接子文件描述符设置为非阻塞方式
976
977 if(flags<0) return -1;
978 flags|=O_NONBLOCK;//设置非阻塞方式
979 if(fcntl(sockfd,F_SETFL,flags) < 0)
980 {
981 cout << "1.fcntl() in nonb_connect" << endl;
982 return -1;
983 }
984
985 if( connect(sockfd,sa,sizeof(struct sockaddr)) == 0)//立刻连接上了
986 {
987 flags&=~O_NONBLOCK;//因为上面已经设置了非阻塞方式,所以我们这里有必要重新设置阻塞方式--相当于复位
988 fcntl(sockfd,F_SETFL,flags);
989 return sockfd;//connected immediately
990 }
991
992 FD_ZERO(&mask);//清理写文件描述符集合mask
993 FD_SET(sockfd,&mask);//将sockfd套接字文件描述符加到文件描述符集合mask中
994 timeout.tv_sec=sec;//设置最长的等待时间
995 timeout.tv_usec=0;
996 status=select(sockfd+1,NULL,&mask,NULL,&timeout);//IO复用
997
998 switch(status){
999 case -1: // Select error, set the socket as default blocking //select()出错
1000 flags&=~O_NONBLOCK;
1001 fcntl(sockfd,F_SETFL,flags);
1002 cout << "2.fcntl() in nonb_connect" << endl;
1003 return -1;
1004 case 0: //Connection timed out.//连接超时
1005 flags&=~O_NONBLOCK;
1006 fcntl(sockfd,F_SETFL,flags);
1007 cout << "3.fcntl() in nonb_connect" << endl;
1008 return -1;
1009 default: // Connected successfully.//连接成功
1010 FD_CLR(sockfd,&mask);
1011 flags&=~O_NONBLOCK;
1012 fcntl(sockfd,F_SETFL,flags);
1013 return 0;
1014 }
1015 }
1016
1017 /*
1018 function: 检测*buf所指的内存空间剩余值是否大于more,不过再加more+1单位的内存空间
1019 success: return 0;
1020 fail: return -1;
1021 */
1022 int CHttp::checkBufSize(char **buf, int *bufsize, int more)
1023 {
1024 char *tmp;
1025 int roomLeft = *bufsize - (strlen(*buf) + 1);//*buf内存空间的剩余值
1026
1027 if(roomLeft > more) return 0;//剩余值大于more返回0
1028
1029 //pthread_mutex_lock(&mutexMemory);
1030 tmp = (char *)realloc(*buf, *bufsize + more + 1);//剩余值不够more,这个时候我们要调整内存空间的长度,长度加more+1
1031 //pthread_mutex_unlock(&mutexMemory);
1032 if(tmp == NULL) return -1;//没有调整成功返回-1
1033
1034 *buf = tmp;
1035 *bufsize += more + 1;
1036 return 0;//调整成功
1037 }