转自:http://blog.csdn.net/huangxy10/article/details/8120106
备注:需要把项目属性中的字符集改成“多字节字符集”。
1 // 网络爬虫.cpp : 定义控制台应用程序的入口点。
2 //
3
4 #include "stdafx.h"
5 /*
6
7 int _tmain(int argc, _TCHAR* argv[])
8 {
9 return 0;
10 }
11
12 */
13
//#include <Windows.h>
#include "winsock2.h"

#include <time.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <hash_set>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_set>
#include <vector>
23
24 #pragma comment(lib, "ws2_32.lib")
25 using namespace std;
26
// Initial size, in bytes, of the buffer a downloaded response is read into (1 MiB).
#define DEFAULT_PAGE_BUF_SIZE 1048576

// Page links waiting to be crawled, in the order they were discovered.
std::queue<std::string> hrefUrl;
// Page links that have already been queued, so no link is queued twice.
// std::unordered_set replaces the obsolete, MSVC-only hash_set; it provides
// the same find()/insert()/end() members the rest of the file relies on.
std::unordered_set<std::string> visitedUrl;
// Image links that have already been collected, so no image is fetched twice.
std::unordered_set<std::string> visitedImg;
// Not referenced by any code in this file.
int depth=0;
// Counter printed in front of each saved image path; starts at 1.
int g_ImgCnt=1;
34
// Splits a URL into its host name and its resource path.
//
// An optional leading "http://" is skipped. The text up to the first '/'
// becomes `host`; the remainder, starting at that '/', becomes `resource`.
// The resource is cut at the first whitespace character because it is later
// placed verbatim into an HTTP request line.
//
// Returns false and leaves `host` / `resource` untouched when
//   - the URL is longer than 2000 characters,
//   - there is no '/' after the host part, or
//   - the host part is empty (e.g. a relative link such as "/index.html").
//
// Only a "http://" *prefix* is treated as the scheme; a URL that merely
// contains "http://" somewhere in its query string is not re-anchored there.
bool ParseURL( const std::string & url, std::string & host, std::string & resource ){
    const std::string::size_type kMaxUrlLength = 2000;
    if ( url.size() > kMaxUrlLength ) {
        return false;
    }

    static const char kScheme[] = "http://";
    const std::string::size_type schemeLength = sizeof(kScheme) - 1;

    std::string::size_type hostBegin = 0;
    if ( url.compare( 0, schemeLength, kScheme ) == 0 ) {
        hostBegin = schemeLength;
    }

    const std::string::size_type slash = url.find( '/', hostBegin );
    if ( slash == std::string::npos || slash == hostBegin ) {
        return false;   // no resource part, or nothing in front of it
    }

    // std::string grows as needed, so neither part can overrun a fixed buffer.
    const std::string::size_type resourceEnd = url.find_first_of( " \t\n\v\f\r", slash );
    const std::string parsedHost = url.substr( hostBegin, slash - hostBegin );
    const std::string parsedResource =
        ( resourceEnd == std::string::npos ) ? url.substr( slash )
                                             : url.substr( slash, resourceEnd - slash );

    host = parsedHost;
    resource = parsedResource;
    return true;
}
53
54 //使用Get请求,得到响应
55 bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){
56 string host, resource;
57 if(!ParseURL( url, host, resource )){
58 cout << "Can not parse the url"<<endl;
59 return false;
60 }
61
62 //建立socket
63 struct hostent * hp= gethostbyname( host.c_str() );
64 if( hp==NULL ){
65 cout<< "Can not find host address"<<endl;
66 return false;
67 }
68
69 SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP);
70 if( sock == -1 || sock == -2 ){
71 cout << "Can not create sock."<<endl;
72 return false;
73 }
74
75 //建立服务器地址
76 SOCKADDR_IN sa;
77 sa.sin_family = AF_INET;
78 sa.sin_port = htons( 80 );
79 //char addr[5];
80 //memcpy( addr, hp->h_addr, 4 );
81 //sa.sin_addr.s_addr = inet_addr(hp->h_addr);
82 memcpy( &sa.sin_addr, hp->h_addr, 4 );
83
84 //建立连接
85 if( 0!= connect( sock, (SOCKADDR*)&sa, sizeof(sa) ) ){
86 cout << "Can not connect: "<< url <<endl;
87 closesocket(sock);
88 return false;
89 };
90
91 //准备发送数据
92 string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";
93
94 //发送数据
95 if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){
96 cout << "send error" <<endl;
97 closesocket( sock );
98 return false;
99 }
100
101 //接收数据
102 int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
103 char *pageBuf = (char *)malloc(m_nContentLength);
104 memset(pageBuf, 0, m_nContentLength);
105
106 bytesRead = 0;
107 int ret = 1;
108 cout <<"Read: ";
109 while(ret > 0){
110 ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
111
112 if(ret > 0)
113 {
114 bytesRead += ret;
115 }
116
117 if( m_nContentLength - bytesRead<100){
118 cout << "\nRealloc memorry"<<endl;
119 m_nContentLength *=2;
120 pageBuf = (char*)realloc( pageBuf, m_nContentLength); //重新分配内存
121 }
122 cout << ret <<" ";
123 }
124 cout <<endl;
125
126 pageBuf[bytesRead] = '\0';
127 response = pageBuf;
128 closesocket( sock );
129 return true;
130 //cout<< response <<endl;
131 }
132
133 //提取所有的URL以及图片URL
134 void HTMLParse ( string & htmlResponse, vector<string> & imgurls, const string & host ){
135 //找所有连接,加入queue中
136 const char *p= htmlResponse.c_str();
137 char *tag="href=\"";
138 const char *pos = strstr( p, tag );
139 ofstream ofile("url.txt", ios::app);
140 while( pos ){
141 pos +=strlen(tag);
142 const char * nextQ = strstr( pos, "\"" );
143 if( nextQ ){
144 char * url = new char[ nextQ-pos+1 ];
145 //char url[100]; //固定大小的会发生缓冲区溢出的危险
146 sscanf( pos, "%[^\"]", url);
147 string surl = url; // 转换成string类型,可以自动释放内存
148 if( visitedUrl.find( surl ) == visitedUrl.end() ){
149 visitedUrl.insert( surl );
150 ofile << surl<<endl;
151 hrefUrl.push( surl );
152 }
153 pos = strstr(pos, tag );
154 delete [] url; // 释放掉申请的内存
155 }
156 }
157 ofile << endl << endl;
158 ofile.close();
159
160 tag ="<img ";
161 const char* att1= "src=\"";
162 const char* att2="lazy-src=\"";
163 const char *pos0 = strstr( p, tag );
164 while( pos0 ){
165 pos0 += strlen( tag );
166 const char* pos2 = strstr( pos0, att2 );
167 if( !pos2 || pos2 > strstr( pos0, ">") ) {
168 pos = strstr( pos0, att1);
169 if(!pos) {
170 pos0 = strstr(att1, tag );
171 continue;
172 } else {
173 pos = pos + strlen(att1);
174 }
175 }
176 else {
177 pos = pos2 + strlen(att2);
178 }
179
180 const char * nextQ = strstr( pos, "\"");
181 if( nextQ ){
182 char * url = new char[nextQ-pos+1];
183 sscanf( pos, "%[^\"]", url);
184 cout << url<<endl;
185 string imgUrl = url;
186 if( visitedImg.find( imgUrl ) == visitedImg.end() ){
187 visitedImg.insert( imgUrl );
188 imgurls.push_back( imgUrl );
189 }
190 pos0 = strstr(pos0, tag );
191 delete [] url;
192 }
193 }
194 cout << "end of Parse this html"<<endl;
195 }
196
// Turns a URL into a file name: drops every character that is not allowed in
// a Windows file name (\ / : * ? " < > |) and appends ".txt".
std::string ToFileName( const std::string &url ){
    static const char kForbidden[] = "\\/:*?\"<>|";

    std::string kept;
    kept.reserve( url.size() );

    // Copy the runs of allowed characters that lie between forbidden ones.
    std::string::size_type runBegin = 0;
    while( runBegin < url.size() ){
        const std::string::size_type bad = url.find_first_of( kForbidden, runBegin );
        if( bad == std::string::npos ){
            kept.append( url, runBegin, std::string::npos );
            break;
        }
        kept.append( url, runBegin, bad - runBegin );
        runBegin = bad + 1;
    }

    kept += ".txt";
    return kept;
}
209
210 //下载图片到img文件夹
211 void DownLoadImg( vector<string> & imgurls, const string &url ){
212
213 //生成保存该url下图片的文件夹
214 string foldname = ToFileName( url );
215 foldname = "./img/"+foldname;
216 if(!CreateDirectory( (LPCSTR)foldname.c_str(),NULL ))
217 cout << "Can not create directory:"<< foldname<<endl;
218 char *image;
219 int byteRead;
220 for( int i=0; i<imgurls.size(); i++){
221 //判断是否为图片,bmp,jgp,jpeg,gif
222 string str = imgurls[i];
223 int pos = str.find_last_of(".");
224 if( pos == string::npos )
225 continue;
226 else{
227 string ext = str.substr( pos+1, str.size()-pos-1 );
228 if( ext!="bmp"&& ext!="jpg" && ext!="jpeg"&& ext!="gif"&&ext!="png")
229 continue;
230 }
231 //下载其中的内容
232 if( GetHttpResponse(imgurls[i], image, byteRead)){
233 if ( strlen(image) ==0 ) {
234 continue;
235 }
236 const char *p=image;
237 const char * pos = strstr(p,"\r\n\r\n")+strlen("\r\n\r\n");
238 int index = imgurls[i].find_last_of("/");
239 if( index!=string::npos ){
240 string imgname = imgurls[i].substr( index , imgurls[i].size() );
241 ofstream ofile( foldname+imgname, ios::binary );
242 if( !ofile.is_open() )
243 continue;
244 cout <<g_ImgCnt++<< foldname+imgname<<endl;
245 ofile.write( pos, byteRead- (pos-p) );
246 ofile.close();
247 }
248 free(image);
249 }
250 }
251 }
252
253
254
255 //广度遍历
256 void BFS( const string & url ){
257 char * response;
258 int bytes;
259 // 获取网页的相应,放入response中。
260 if( !GetHttpResponse( url, response, bytes ) ){
261 cout << "The url is wrong! ignore." << endl;
262 return;
263 }
264 string httpResponse=response;
265 free( response );
266 string filename = ToFileName( url );
267 ofstream ofile( "./html/"+filename );
268 if( ofile.is_open() ){
269 // 保存该网页的文本内容
270 ofile << httpResponse << endl;
271 ofile.close();
272 }
273 vector<string> imgurls;
274 //解析该网页的所有图片链接,放入imgurls里面
275 HTMLParse( httpResponse, imgurls, url );
276
277 //下载所有的图片资源
278 DownLoadImg( imgurls, url );
279 }
280
281 void main()
282 {
283 //初始化socket,用于tcp网络连接
284 WSADATA wsaData;
285 if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){
286 return;
287 }
288
289 // 创建文件夹,保存图片和网页文本文件
290 CreateDirectory((LPCSTR) "./img",0);
291 CreateDirectory((LPCSTR)"./html",0);
292 //string urlStart = "http://hao.360.cn/meinvdaohang.html";
293
294 // 遍历的起始地址
295 // string urlStart = "http://www.wmpic.me/tupian";
296 string urlStart = "http://item.taobao.com/item.htm?spm=a230r.1.14.19.sBBNbz&id=36366887850&ns=1#detail";
297
298 // 使用广度遍历
299 // 提取网页中的超链接放入hrefUrl中,提取图片链接,下载图片。
300 BFS( urlStart );
301
302 // 访问过的网址保存起来
303 visitedUrl.insert( urlStart );
304
305 while( hrefUrl.size()!=0 ){
306 string url = hrefUrl.front(); // 从队列的最开始取出一个网址
307 cout << url << endl;
308 BFS( url ); // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片
309 hrefUrl.pop(); // 遍历完之后,删除这个网址
310 }
311 WSACleanup();
312 return;
313 }