网页抓取总结(一)

今天在公司学了网页抓取,感觉在学校C就学了个皮毛,到了公司啥都不懂。做个简单的总结

1、被调用函数所在的源文件与工程不在同一个文件夹时,#include 的文件名要带上相对路径,如 #include "lyPublic/lyCodeConvert.c"

2、窗口事件要修改 工程-设置-连接的“/subsystem:console /incremental:yes” ,改为“/subsystem:windows /incremental:yes”

3、抓取网页时,要注意所传网址中汉字在UTF-8和GBK之间的转换:要先将GBK转为UTF-8,再拼入网址打开,不然会丢失关键词

今天成果:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "lyGetHttpResult.h"
#include "lyPublic/lyCodeConvert.c"
/*
 * Look up a word in the chazidian.com near-synonym index and save the page
 * that its index entry links to.
 *
 * Steps:
 *   1. Fetch the index page and convert it with CodeConvert mode 1
 *      (UTF-8 -> GBK according to the notes accompanying this program).
 *   2. Read the word to look for from stdin.
 *   3. Follow the "next page" links until a page containing the word is found.
 *   4. Extract the URL surrounding the word, fetch it, print the page and
 *      write it to Text.txt.
 *
 * Returns EXIT_SUCCESS when the page was fetched and saved, EXIT_FAILURE on
 * any input, network, parsing or file error.
 */
int main(void)
{
    /*
     * The next-page URL is taken from a fixed window of bytes that ends ON
     * the first byte of the "next page" marker (the window the original
     * loop copied).
     * NOTE(review): including the marker's first byte looks like an
     * off-by-one; confirm against the real page markup before changing it.
     */
    enum { NEXT_LINK_BYTES = 60 };

    char szUrl[512] = "";
    char svData[1024 * 40] = "";
    char *szData = NULL;
    FILE *fp = NULL;
    char *p, *q;
    char strFrom[100] = "", strTo[100] = "";
    size_t urlLen;
    int isSecondWord = 0;   /* 0: word directly follows '/', 1: it does not */
    int rc = EXIT_FAILURE;

    strcpy(szUrl, "http://www.chazidian.com/jinyicidaquan/");
    szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
    if (szData == NULL) {
        fprintf(stderr, "failed to fetch %s\n", szUrl);
        goto done;
    }
    CodeConvert(szData, svData, sizeof(svData), 1);
    /* Defensive: CodeConvert's behaviour on truncation is not visible here. */
    svData[sizeof(svData) - 1] = '\0';

    /* fgets instead of gets: bounded read, then strip the line ending. */
    if (fgets(strFrom, sizeof(strFrom), stdin) == NULL) {
        fprintf(stderr, "no input word\n");
        goto done;
    }
    strFrom[strcspn(strFrom, "\r\n")] = '\0';
    if (strFrom[0] == '\0') {
        /* An empty needle would match at svData[0] and break the parsing. */
        fprintf(stderr, "empty input word\n");
        goto done;
    }

    /* Not on this page: move on to the next page until the word shows up. */
    while (strstr(svData, strFrom) == NULL) {
        /*
         * NOTE(review): this literal is compared with converted (GBK) page
         * text, so the source file presumably has to be saved as GBK.
         */
        p = strstr(svData, "下一页");
        if (p == NULL || p - svData < NEXT_LINK_BYTES - 1) {
            fprintf(stderr, "word not found and no further page\n");
            goto done;
        }
        memset(szUrl, 0, sizeof(szUrl));
        memcpy(szUrl, p - (NEXT_LINK_BYTES - 1), NEXT_LINK_BYTES);

        free(szData);   /* release the previous page before refetching */
        szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
        if (szData == NULL) {
            fprintf(stderr, "failed to fetch %s\n", szUrl);
            goto done;
        }
        CodeConvert(szData, svData, sizeof(svData), 1);
        svData[sizeof(svData) - 1] = '\0';
    }

    /* Locate the word; the loop above guarantees that it is present. */
    p = strstr(svData, strFrom);
    if (p == svData) {
        fprintf(stderr, "unexpected page layout\n");
        goto done;
    }

    if (*(p - 1) == '/') {
        q = p - 1;
    } else {
        /* Walk back to the nearest '/', never past the buffer start. */
        q = p - 1;
        while (q > svData && *q != '/')
            q--;
        if (*q != '/') {
            fprintf(stderr, "unexpected page layout\n");
            goto done;
        }
        p = q + 1;
        isSecondWord = 1;
    }

    /* Walk back to the opening quote of the attribute holding the URL. */
    while (q > svData && *q != '"')
        q--;
    if (*q != '"') {
        fprintf(stderr, "unexpected page layout\n");
        goto done;
    }

    /* The URL is everything between the quote and p (both exclusive). */
    urlLen = (size_t)(p - (q + 1));
    if (urlLen >= sizeof(szUrl)) {
        fprintf(stderr, "link too long\n");
        goto done;
    }
    memset(szUrl, 0, sizeof(szUrl));
    memcpy(szUrl, q + 1, urlLen);

    if (!isSecondWord) {
        /* Mode 2 (GBK -> UTF-8 per the notes): the URL needs the UTF-8 word. */
        CodeConvert(strFrom, strTo, sizeof(strTo), 2);
        strTo[sizeof(strTo) - 1] = '\0';
        if (strlen(szUrl) + strlen(strTo) >= sizeof(szUrl)) {
            fprintf(stderr, "link too long\n");
            goto done;
        }
        strcat(szUrl, strTo);
    }
    puts(szUrl);

    free(szData);
    szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
    if (szData == NULL) {
        fprintf(stderr, "failed to fetch %s\n", szUrl);
        goto done;
    }
    CodeConvert(szData, svData, sizeof(svData), 1);
    svData[sizeof(svData) - 1] = '\0';
    puts(svData);

    /*
     * Open the existing file for update, or create it when it is missing.
     * A single "r+" attempt replaces the former probe-then-reopen sequence,
     * which leaked a FILE handle and used the invalid mode "w+r".  As
     * before, an existing longer file is overwritten from the start without
     * being truncated.
     */
    fp = fopen("Text.txt", "r+");
    if (fp == NULL)
        fp = fopen("Text.txt", "w+");
    if (fp == NULL) {
        perror("Text.txt");
        goto done;
    }
    if (fputs(svData, fp) == EOF) {
        perror("Text.txt");
        fclose(fp);
        goto done;
    }
    if (fclose(fp) != 0) {
        perror("Text.txt");
        goto done;
    }

    rc = EXIT_SUCCESS;

done:
    free(szData);
    szData = NULL;
    return rc;
}

 优化1.2版:

 1 #include <stdlib.h>
 2 #include <stdio.h>
 3 #include <string.h>
 4 #include "lyGetHttpResult.h"
 5 #include "lyPublic/lyCodeConvert.c"
 6 int main()
 7 {
 8 
 9     char szUrl[512] = "";
10     char svData[1024 * 40] = "";
11     char *szData = NULL;
12 //    FILE *fp;
13 //    char *p, *q,*q2,*p2;
14     char *p;
15     char strFrom[100] = "", strTo[100] = "";
16     char findStr[20] = "", andStr[20] = "</span> - ";//查找标记串
17     char outStr[100] = "",reStr[100] = "";
18     char str[100] = "",str2[100] = "";
19     int len;
20     while(gets(strFrom))
21     {                //初串
22         memset(str,0,sizeof(str));
23         memset(reStr,0,sizeof(reStr));
24         memset(str2,0,sizeof(str2));
25         memset(findStr,0,sizeof(findStr));
26         memset(strTo,0,sizeof(strTo));
27         strcpy(str,"http://www.chazidian.com/jinyici/");
28         strcpy(reStr,strFrom);
29         CodeConvert(strFrom, str2, sizeof(str2), 2);//先将汉字GBK转为UTF-8再接道网址后面
30         strcat(str,str2);
31 
32         sprintf(szUrl, str);
33         szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
34         if(!szData)
35             return NULL;
36 
37         CodeConvert(szData, svData, sizeof(svData), 1);//找汉字的时候是找GBK。,所以还要转回来
38         /*    if(fopen("Text.txt", "r+") == NULL)
39                     fp=fopen("Text.txt", "w+r");
40                 else
41                     fp=fopen("Text.txt", "r+");
42                 fputs(svData, fp);*/
43         strcpy(findStr,strFrom);//
44         strcat(findStr,andStr);
45         p = strstr(svData, findStr);
46         len = strlen(outStr);
47         while(*p != '\n') ///有雨原网页的特点,设置为遇到回车结束
48         {
49             if(*p != '<' && (*p < 'a'||*p > 'z') && *p != '/' && *p != '>' && *p != '-')
50             {
51                 outStr[len++] = *p;
52             }
53             p++;
54         }
55         puts(outStr);
56 
57         p = strstr(outStr,reStr);//去重
58         len = strlen(reStr);
59         p+=len+2;
60         printf("%s\n",p);
61         memset(strFrom,0,sizeof(strFrom));
62         memset(outStr,0,sizeof(outStr));
63         free(szData);
64         szData = NULL;
65     }
66     return 1;
67 }
View Code

 

posted @ 2013-07-16 20:16  煮人为乐  阅读(306)  评论(0编辑  收藏  举报