某查查企业爬虫(模拟人工验证码)

1、技术栈：.NET Core 3.1、C#、Selenium

//爬取所有省份、城市、区县
/// <summary>
/// Walks the province / city / county hierarchy served by qcc.com and writes one
/// search link per county to the file "企查查.txt".
/// </summary>
/// <param name="configPath">Not read by this implementation.</param>
/// <param name="directory">Not read by this implementation.</param>
/// <returns>A task that completes once every generated link has been written.</returns>
public override async Task WriteAreaToFileAsync(string configPath, string directory)
        {
            const string SearchPageLink = "https://www.qcc.com/search?key={keyword}#industrycode:K&";
            const string CityListLink = "https://www.qcc.com/search_getCityListHtml?province={0}";
            const string CountyListLink = "https://www.qcc.com/search_getCountyListHtml?city={0}";
            const string AreaFilterFormat = "province:{0}&city:{1}&county:{2}&";

            // The province entries are read from the search page itself.
            List<string> provinceCodes = await GetCodeAsync(
                new Uri(SearchPageLink),
                ".sfilter-tag.clearfix.provinceChoose dd a");

            var areaLinks = new List<string>();
            foreach (var provinceCode in provinceCodes)
            {
                // The shared builder is reset before every link it is used to compose.
                StringBuilder.Clear();
                string cityListAddress = StringBuilder.AppendFormat(CityListLink, provinceCode).ToString();
                var cityCodes = await GetCodeAsync(new Uri(cityListAddress), "dd a");

                foreach (var cityCode in cityCodes)
                {
                    StringBuilder.Clear();
                    string countyListAddress = StringBuilder.AppendFormat(CountyListLink, cityCode).ToString();
                    var countyCodes = await GetCodeAsync(new Uri(countyListAddress), "dd a");

                    foreach (var countyCode in countyCodes)
                    {
                        StringBuilder.Clear();
                        StringBuilder.Append(SearchPageLink);

                        // The replacements run over the whole buffer, so they also rewrite
                        // the search-page prefix appended just above.
                        string areaLink = StringBuilder
                            .AppendFormat(AreaFilterFormat, provinceCode, cityCode, countyCode)
                            .Replace("search", "search_index")
                            .Replace("中介#", "中介&ajaxflag=1&")
                            .Replace(":industrycode", "=industrycode")
                            .ToString();

                        areaLinks.Add(areaLink);
                    }
                }
            }

            await File.WriteAllLinesAsync("企查查.txt", areaLinks);
        }

//分页爬取企业信息
        /// <summary>
        /// Downloads one search-result page, hands any redirect challenge to
        /// <c>VertifyCode</c>, and passes every matched result cell to <c>VertifyAsync</c>.
        /// </summary>
        /// <param name="cityUri">Address of the result page to download.</param>
        /// <returns>
        /// <c>true</c> when the page held at least <c>PageSize</c> entries;
        /// <c>false</c> when it held fewer, reported that no data was found, or never
        /// produced a recognizable response within the retry limit.
        /// </returns>
        private async Task<bool> GetAgentsAsync(Uri cityUri)
        {
            // Upper bound on re-fetches of a page that matches none of the known markers.
            // Without it such a page would keep this loop spinning forever.
            const int MaxUnrecognizedResponses = 5;
            const int RetryDelayMilliseconds = 1000;

            LogHelper.Info(cityUri.ToString());
            var pageSource = await HttpClient.GetStringAsync(cityUri);
            int unrecognizedResponses = 0;

            while (!pageSource.Contains("查企业"))
            {
                if (pageSource.StartsWith("<script>window.location", StringComparison.Ordinal))
                {
                    // The text between the first pair of single quotes is used as the
                    // address handed to VertifyCode (presumably the verification page).
                    VertifyCode(new Uri(pageSource.Split("'")[1]));
                }
                else if (pageSource.Contains("小查还没找到数据"))
                {
                    return false;
                }
                else if (++unrecognizedResponses >= MaxUnrecognizedResponses)
                {
                    LogHelper.Info("Unrecognized response, giving up: " + cityUri);
                    return false;
                }
                else
                {
                    // Pause briefly so the retry does not hammer the server.
                    await Task.Delay(RetryDelayMilliseconds);
                }

                pageSource = await HttpClient.GetStringAsync(cityUri);
            }

            var entries = JumonyParser.Parse(pageSource).Find(".m_srchList tbody tr td:nth-child(3)");

            // Count while iterating so the result sequence is enumerated only once.
            int entryCount = 0;
            foreach (var entry in entries)
            {
                await VertifyAsync(entry.InnerHtml());
                entryCount++;
            }

            // A page shorter than PageSize is reported as false.
            return entryCount >= PageSize;
        }

2、结果截图

3、需要开通vip账号

4、过滑动验证码

posted @ 2020-09-03 12:47  Zdelta  阅读(42)  评论(0)  编辑  收藏  举报