1 #ifndef _URL_H_030728_
2 #define _URL_H_030728_
3
#include <pthread.h>
#include <string>
5
// Length limits used by the URL helpers. HOST_LEN is the bound applied by
// the CUrl code in this file; URL_LEN is presumably the matching limit for a
// whole URL (it is not referenced in the visible code -- TODO confirm).
const unsigned int URL_LEN  = 256U;
const unsigned int HOST_LEN = 256U;
8
9 using namespace std;
10
11
// URL schemes distinguished by CUrl::ParseScheme(). ParseScheme() casts the
// index of the matching scheme-table row to this enum, so the enumerator
// order must stay aligned with that table.
enum url_scheme
{
    SCHEME_HTTP,     // "http://"
    SCHEME_FTP,      // "ftp://"
    SCHEME_INVALID   // no supported scheme matched
};
17
// Ports assumed for each scheme when a URL does not name one explicitly.
const int DEFAULT_HTTP_PORT = 80;
const int DEFAULT_FTP_PORT  = 21;
20
21 class CUrl
22 {
23 public:
24 string m_sUrl; // 原始的url地址
25 enum url_scheme m_eScheme; // URL 类型
26
27 string m_sHost; // 提取出来的主机地址
28 int m_nPort; // 主机端口号
29 string m_sPath; //路径
30
31
32 public:
33 CUrl();
34 ~CUrl();
35
36 //bool ParseUrl(string strUrl);
37
38 // break an URL into scheme, host, port and request.
39 // result as member variants
40 bool ParseUrlEx(string strUrl);
41
42 // break an URL into scheme, host, port and request.
43 // result url as argvs
44 void ParseUrlEx(const char *url, char *protocol, int lprotocol,
45 char *host, int lhost,
46 char *request, int lrequest, int *port);
47
48 // get the ip address by host name
49 char *GetIpByHost(const char *host);
50
51 bool IsValidHost(const char *ip);
52 bool IsForeignHost(string host);
53 bool IsImageUrl(string url);
54 bool IsValidIp(const char *ip);
55 bool IsVisitedUrl(const char *url);
56 bool IsUnReachedUrl(const char *url);
57 bool IsValidHostChar(char ch);
58
59 //private:
60 void ParseScheme (const char *url);
61 };
62
// Mutex declared for guarding memory allocation. No definition is visible
// here, and the only uses in the implementation are commented-out
// lock/unlock calls around new[]/delete[].
extern pthread_mutex_t mutexMemory;
64
65 #endif /* _URL_H_030728_ */
1 /* URL handling
2 */
3
4 #include <iostream>
5 #include <string.h>
6 #include <sys/socket.h>
7 #include <netdb.h>
8 #include <map>
9 #include "Url.h"
10 #include <stdlib.h>
11 #include <arpa/inet.h>
12
13 //#include "Tse.h"
14 //#include "Url.h"
15 //#include "Http.h"
16 //#include "Md5.h"
17 //#include "StrFun.h"
18
19
20
21 //
/* Is X exactly the string "."?
 * (The second test used to reference the undefined identifier `sdfx`
 * instead of the macro argument, so any use of DOTP failed to compile.) */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
/* Is X exactly the string ".."? */
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
26
// File-wide cache of resolved hosts: host name -> IP address text.
// CUrl::GetIpByHost() reads and fills it; no locking is performed here
// (the mutex below is commented out).
std::map<std::string, std::string> mapCacheHostLookup;

// Declarations left commented out in the original. Note that
// setVisitedUrlMD5 and mapIpBlock are still referenced further down
// in this file.
//extern vector<string> vsUnreachHost;
//pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
//extern set<string> setVisitedUrlMD5;
//extern map<unsigned long,unsigned long> mapIpBlock;

// Key/value pair type used when inserting into mapCacheHostLookup.
typedef std::map<std::string, std::string>::value_type valTypeCHL;
33
// One row of the scheme table.
struct scheme_data
{
    const char *leading_string;  // URL prefix identifying the scheme; NULL ends the table
    int         default_port;    // port associated with the scheme
    int         enabled;         // non-zero when the scheme is accepted
};
40
41 /* 支持的网页类型 */
42 static struct scheme_data supported_schemes[] =
43 {
44 { "http://", DEFAULT_HTTP_PORT, 1 },
45 { "ftp://", DEFAULT_FTP_PORT, 1 },
46
47 /* 不合法的网页 */
48 { NULL, -1, 0 }
49 };
50
51 /* 分析类型,若是合法则返回正确的,否则是不合法的 */
52 void CUrl::ParseScheme (const char *url)
53 {
54 int i;
55
56 for (i = 0; supported_schemes[i].leading_string; i++)
57 if (0 == strncasecmp (url, supported_schemes[i].leading_string,strlen (supported_schemes[i].leading_string)))//比较url的前几个字母
58 {
59 if (supported_schemes[i].enabled)
60 {
61 this->m_eScheme = (enum url_scheme) i;
62 return;
63 }
64 else
65 {
66 this->m_eScheme = SCHEME_INVALID;
67 return;
68 }
69 }
70
71 this->m_eScheme = SCHEME_INVALID;
72 return;
73 }
74
75 /************************************************************************
76 * Function name: ParseUrlEx
77 * Input argv:
78 * -- strUrl: url
79 * Output argv:
80 * --
81 * Return:
82 true: success
83 false: fail
84 * Fucntion Description: break an URL into scheme, host, port and request.
85 * result as member variants
86 * Be careful: release the memory by the client
87 ************************************************************************/
88
89 bool CUrl::ParseUrlEx(string strUrl)
90 {
91 char protocol[10];
92 char host[HOST_LEN];
93 char request[256];
94 int port = -1;
95
96 memset( protocol, 0, sizeof(protocol) );
97 memset( host, 0, sizeof(host) );
98 memset( request, 0, sizeof(request) );
99
100 this->ParseScheme(strUrl.c_str());
101 if( this->m_eScheme != SCHEME_HTTP )
102 {
103 return false;
104 }
105
106 ParseUrlEx(strUrl.c_str(),
107 protocol, sizeof(protocol),
108 host, sizeof(host),
109 request, sizeof(request),
110 &port);
111
112 m_sUrl = strUrl;
113 m_sHost = host;
114 m_sPath = request;
115
116 if( port > 0 ){
117 m_nPort = port;
118 }
119
120 return true;
121 }
122
123 /************************************************************************
124 * Function name: ParseUrlEx
125 * Input argv:
126 * -- url: host name
127 * -- protocol: result protocol
128 * -- lprotocol: protocol length
129 * -- host: result host
130 * -- lhost: host length
131 * -- request: result request
132 * -- lrequest: request length
133 * Output argv:
134 * --
135 * Return:
136 true: success
137 false: fail
138 * Fucntion Description: break an URL into scheme, host, port and request.
139 * result as argvs
140 * Be careful:
141 ************************************************************************/
142 void CUrl::ParseUrlEx(const char *url,
143 char *protocol, int lprotocol,
144 char *host, int lhost,
145 char *request, int lrequest,
146 int *port)
147 {
148 char *work,*ptr,*ptr2;
149
150 *protocol = *host = *request = 0;
151 *port = 80;
152
153 int len = strlen(url);
154 //pthread_mutex_lock(&mutexMemory);
155 work = new char[len + 1];
156 //pthread_mutex_unlock(&mutexMemory);
157 memset(work, 0, len+1);
158 strncpy(work, url, len);
159 //把url的内容复制到work中
160
161 // find protocol if any
162 //在work中查找:(默认的是http)
163 ptr = strchr(work, ':');
164 if( ptr != NULL )
165 {
166 *(ptr++) = 0;
167 strncpy( protocol, work, lprotocol );
168 } else {
169 strncpy( protocol, "HTTP", lprotocol );
170 ptr = work;
171 }
172
173 // skip past opening /'s
174 //调过 //
175 if( (*ptr=='/') && (*(ptr+1)=='/') )
176 ptr+=2;
177
178 // 查找主机地址
179 ptr2 = ptr;
180 while( IsValidHostChar(*ptr2) && *ptr2 )
181 ptr2++;
182 *ptr2 = 0;//保证合法的字符串
183 strncpy( host, ptr, lhost );
184
185 //查找请求的网页
186 int offset = ptr2 - work;
187 const char *pStr = url + offset;
188 strncpy( request, pStr, lrequest );
189
190 //找到主机的端口
191 ptr = strchr( host, ':' );
192 if( ptr != NULL ){
193 *ptr = 0;
194 *port = atoi(ptr+1);
195 }
196
197 //pthread_mutex_lock(&mutexMemory);
198 delete [] work;
199 //pthread_mutex_unlock(&mutexMemory);
200 work = NULL;
201 }
202
203
204
205
206
207
208 /* scheme://user:pass@host[:port]...
209 * ^
210 * We attempt to break down the URL into the components path,
211 * params, query, and fragment. They are ordered like this:
212 * scheme://host[:port][/path][;params][?query][#fragment]
213 */
214
215 /*
216 bool CUrl::ParseUrl(string strUrl)
217 {
218 string::size_type idx;
219
220 this->ParseScheme(strUrl.c_str());
221 if( this->m_eScheme != SCHEME_HTTP )
222 return false;
223
224 // get host name
225 this->m_sHost = strUrl.substr(7);
226 idx = m_sHost.find('/');
227 if(idx != string::npos){
228 m_sHost = m_sHost.substr(0,idx);
229 }
230
231 this->m_sUrl = strUrl;
232
233 return true;
234 }
235 */
236 //CUrl的构造函数
237 CUrl::CUrl()
238 {
239 this->m_sUrl = "";
240 this->m_eScheme= SCHEME_INVALID;
241
242 this->m_sHost = "";
243 this->m_nPort = DEFAULT_HTTP_PORT; //默认端口
244
245 this->m_sPath = "";
246 /*
247 this->m_sParams = "";
248 this->m_sQuery = "";
249 this->m_sFragment = "";
250
251 this->m_sDir = "";
252 this->m_sFile = "";
253
254 this->m_sUser = "";
255 this->m_sPasswd = "";
256 */
257
258 }
259
260 CUrl::~CUrl()
261 {
262
263 }
264
265
266 /****************************************************************************
267 * Function name: GetIpByHost
268 * Input argv:
269 * -- host: host name
270 * Output argv:
271 * --
272 * Return:
273 ip: sucess
274 NULL: fail
275 * Function Description: get the ip address by host name
276 * Be careful: release the memory by the client
277 ****************************************************************************/
278 //通过主机地址获得IP地址
279 char * CUrl::GetIpByHost(const char *host)
280 {
281
282 if( !host ){ // null pointer
283 return NULL;
284 cout<<"f1";
285 }
286
287 if( !IsValidHost(host) ){ // invalid host
288 return NULL;
289 cout<<"f2";
290 }
291 unsigned long inaddr = 0;
292 char *result = NULL;
293 int len = 0;
294
295
296 inaddr = (unsigned long)inet_addr( host );//将字符串IP转化为32二进制的网络字节序
297 //if ( (int)inaddr != -1){
298 if ( inaddr != INADDR_NONE)
299 { // 主机地址就是用IP地址表示的
300 len = strlen(host);
301 //pthread_mutex_lock(&mutexMemory);
302 result = new char[len+1];
303 cout<<result;
304 //pthread_mutex_unlock(&mutexMemory);
305 memset(result, 0, len+1);
306 memcpy(result, host, len);
307
308 return result;
309 }
310 else
311 {
312 //firt find from cache
313
314 map<string,string>::iterator it = mapCacheHostLookup.find(host);
315 //可以在DNS缓存中找到
316 if( it != mapCacheHostLookup.end() )
317 { //如果在cache中找到IP地址
318 const char * strHostIp;
319
320 strHostIp = (*it).second.c_str();
321
322 inaddr = (unsigned long)inet_addr( strHostIp );
323 //if ( (int)inaddr != -1){
324 if ( inaddr != INADDR_NONE )
325 {
326 len = strlen(strHostIp);
327 //pthread_mutex_lock(&mutexMemory);
328 result = new char[len+1];
329 //pthread_mutex_unlock(&mutexMemory);
330 memset( result, 0, len+1 );
331 memcpy( result, strHostIp, len );
332
333 //cout << ":)" ;
334
335 return result;
336 }
337 }
338 }
339
340 //通过上面的方法我们都没有查找,这个时候我们只能通过DNS server查找了,这种带宽的消耗是必要的!
341 struct hostent *hp; /* Host entity */
342 hp = gethostbyname(host);
343 //通过主机号或者说是域名得到hostent结构,这个结构包含主机号或者说域名的很多信息,例如我们要找的IP字符串就在其中
344 if(hp == NULL) {
345 //cout << "gethostbyname() error in GetIpByHost: " << host << endl;
346 return NULL;
347 }
348
349 // cache host lookup
350 struct in_addr in;
351
352 bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);
353 /*功能:将字符串src的前n个字节复制到dest中
354 说明:bcopy不检查字符串中的空字节NULL,函数没有返回值。*/
355
356 char abuf[INET_ADDRSTRLEN];
357 if( inet_ntop(AF_INET, (void *)&in,abuf, sizeof(abuf)) == NULL )
358 {
359 cout << "inet_ntop() return error in GetIpByHost" << endl;
360 return NULL;
361
362 }
363 else
364 {
365
366 //if( mapCacheHostLookup.count(host) == 0){
367 if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){
368
369 //cout << endl << host << " and " << abuf << endl;
370 mapCacheHostLookup.insert( valTypeCHL ( host, abuf));
371 //更新DNS缓存
372 //cout<<((*mapCacheHostLookup.find("home.ustc.edu.cn")).second.c_str());
373
374 }
375
376 }
377
378 // return result
379 len = strlen(abuf);
380 //pthread_mutex_lock(&mutexMemory);
381 result = new char[len + 1];
382 //pthread_mutex_unlock(&mutexMemory);
383 memset( result, 0, len+1 );
384 memcpy( result, abuf, len );
385
386 return result;
387 }
388
389 /**********************************************************************************
390 * Function name: IsValidHostChar
391 * Input argv:
392 * -- ch: the character for testing
393 * Output argv:
394 * --
395 * Return:
396 true: is valid
397 false: is invalid
398 * Function Description: test the specified character valid
399 * for a host name, i.e. A-Z or 0-9 or -.:
400 **********************************************************************************/
401 bool CUrl::IsValidHostChar(char ch)
402 {
403 return( isalpha(ch) || isdigit(ch)
404 || ch=='-' || ch=='.' || ch==':' || ch=='_');
405 }
406
407 /**********************************************************************************
408 * Function name: IsValidHost
409 * Input argv:
410 * -- ch: the character for testing
411 * Output argv:
412 * --
413 * Return:
414 true: is valid
415 false: is invalid
416 * Function Description: test the specified character valid
417 * for a host name, i.e. A-Z or 0-9 or -.:
418 * Be careful:
419 **********************************************************************************/
420 bool CUrl::IsValidHost(const char *host)
421 {
422 if( !host ){//空的主机号,我们认为是无效的主机号
423 return false;
424 }
425
426 if( strlen(host) < 6 ){ //主机号长度小于6,我们认为ieshi无效的主机号
427 return false;
428 }
429
430 char ch;
431 for(unsigned int i=0; i<strlen(host); i++){
432 ch = *(host++);
433 if( !IsValidHostChar(ch) ){
434 return false;
435 }
436 }
437
438 return true;
439 }
440
441 /**********************************************************************************
442 * Function name: IsVisitedUrl
443 * Input argv:
444 * -- url: url
445 * Output argv:
446 * --
447 * Return:
448 true: is visited
449 false: not visited
450 * Function Description: test the url visited by the MD5
451 * Be careful:
452 **********************************************************************************/
453 bool CUrl::IsVisitedUrl(const char *url)//判断该URL是否访问过
454 {
455 if( !url ){
456 return true; // if be null, we think it have been visited
457 }
458
459 CMD5 iMD5;
460 iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );
461 string strDigest = iMD5.ToString();
462
463 if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
464 return true;
465 } else {
466 return false;
467 }
468
469 }
470
471
472 /**********************************************************************************
473 * Function name: IsValidIp
474 * Input argv:
475 * -- ip: ip
476 * Output argv:
477 * --
478 * Return:
479 true: inside the ip block
480 false: outside the ip block
481 * Function Description: decide teh ip whether or not inside the ip block
482 * Be careful:
483 **********************************************************************************/
484 bool CUrl::IsValidIp(const char *ip)
485 {
486 if( ip == NULL )
487 {
488 return false;
489 }
490
491 unsigned long inaddr = (unsigned long)inet_addr(ip);
492 if( inaddr == INADDR_NONE ){//显然该IP参数不是正确的字符串IP
493 return false;
494 }
495
496 if (mapIpBlock.size() > 0) { //判断是否要过滤掉
497 map<unsigned long, unsigned long>::iterator pos;
498 for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
499 unsigned long ret;
500
501 ret = inaddr & ~((*pos).second);
502 if (ret == (*pos).first) { // inside
503 return true;
504 }
505 }
506
507 // outside
508 return false;
509 }
510
511
512 // if block range is not given, we think it inside also
513 return true;
514 }
515 /*
516 * If it is, return true; otherwise false
517 * not very precise
518 */
519 bool CUrl::IsForeignHost(string host)
520 {
521 if( host.empty() ) return true;
522 if( host.size() > HOST_LEN ) return true;
523
524 unsigned long inaddr = 0;
525
526 inaddr = (unsigned long)inet_addr( host.c_str() );
527 if ( inaddr != INADDR_NONE){ // host is just ip
528 return false;
529 }
530
531 string::size_type idx = host.rfind('.');
532 string tmp;
533 if( idx != string::npos ){
534 tmp = host.substr(idx+1);
535 }
536
537 CStrFun::Str2Lower( tmp, tmp.size() );
538 const char *home_host[] ={
539 "cn","com","net","org","info",
540 "biz","tv","cc", "hk", "tw"
541 };
542
543 int home_host_num = 10;
544
545 for(int i=0; i<home_host_num; i++){
546 if( tmp == home_host[i] )
547 return false;
548 }
549
550 return true;
551 }
552
553
554 bool CUrl::IsImageUrl(string url)
555 {
556 if( url.empty() ) return false;
557 if( url.size() > HOST_LEN ) return false;
558
559 string::size_type idx = url.rfind('.');
560 string tmp;
561 if( idx != string::npos ){
562 tmp = url.substr(idx+1);
563 }
564
565 CStrFun::Str2Lower( tmp, tmp.size() );
566 const char *image_type[] ={
567 "gif","jpg","jpeg","png","bmp",
568 "tif","psd"
569 };
570
571 int image_type_num = 7;
572
573 for (int i=0; i<image_type_num; i++)
574 {
575 if( tmp == image_type[i] )
576 return true;
577 }
578
579 return false;
580 }