using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for company pages on qy.58.com. Depth 0 pages are listing pages
/// whose company links are queued at depth 1; depth 1 pages are parsed into a
/// <c>CompanyEntity</c> and saved through the repository.
/// </summary>
internal class Qy58 : AnalyzerBase
{
    /// <summary>
    /// Initializes the base analyzer and queues the first listing page at depth 0.
    /// </summary>
    /// <param name="crawler">Crawler that receives the seed URL.</param>
    public override void Init(PageCrawler crawler)
    {
        base.Init(crawler);
        var url = new Uri("http://qy.58.com/caohejing/pn1/?PGTID=14177711280840.45006677554920316&ClickID=1");
        // Sample URL kept from the original author (presumably a company detail page):
        //http://qy.58.com/19583455460359/?PGTID=14177659184690.5166369006238447&ClickID=4
        crawler.PushUrl(url, 0);
    }

    /// <summary>
    /// Processes one landed page according to its depth.
    /// Depth 0: follows paging via the ".next" selector and queues every
    /// ".compList a" link at depth 1.
    /// Depth 1: reads the company name and the ".basicMsg" table rows, maps them
    /// onto a <c>CompanyEntity</c> and saves it.
    /// </summary>
    /// <param name="current">The page being analyzed.</param>
    protected override void AnalyzeInternal(PageLandEntity current)
    {
        var lander = Crawler.Lander;
        var pHandler = CreateContentHandler(current);
        switch (current.Depth)
        {
            case 0:
                {
                    pHandler.AjaxBlocks.Add(HACK);
                    var dom = lander.GetDocument(pHandler);
                    DoPerPaging(current, dom.DocumentNode, ".next");
                    foreach (var node in QueryNodes(dom.DocumentNode, ".compList a"))
                    {
                        var url = GetHref(node, current.Url);
                        Crawler.PushUrl(url, 1);
                    }
                }
                break;
            case 1:
                {
                    var dom = lander.GetDocument(pHandler);
                    var attr = new AttributeFiller();
                    attr.Append("Name:{0}", QueryTexts(dom.DocumentNode, ".compT").First());
                    // The first header cell is skipped, as in the original implementation.
                    foreach (var th in QueryNodes(dom.DocumentNode, ".basicMsg table th").Skip(1))
                    {
                        string sTh = th.InnerText, sTd;
                        switch (sTh)
                        {
                            // Phone and e-mail values are looked up as an <img> in the
                            // sibling cell; the image is downloaded and passed to OCR.
                            case "联系电话":
                            case "邮箱":
                                {
                                    var iNode = QueryNode(th.NextSibling, "img");
                                    if (iNode == null)
                                    {
                                        // No image in the cell: fall back to its trimmed text,
                                        // same as the default branch.
                                        // NOTE(review): assumes QueryNode returns null when
                                        // nothing matches - confirm.
                                        sTd = th.NextSibling.InnerText.HtmlTrim();
                                    }
                                    else
                                    {
                                        byte[] imgRaw;
                                        // WebClient is IDisposable; release it as soon as the
                                        // bytes are in memory.
                                        using (var client = new System.Net.WebClient())
                                        {
                                            imgRaw = client.DownloadData(GetHref(iNode, current.Url, attrName: "src"));
                                        }
                                        // The stream must stay open for the lifetime of the
                                        // Bitmap, so the bitmap is the inner (first disposed) scope.
                                        // NOTE(review): assumes OCR does not keep a reference
                                        // to the bitmap after returning - confirm.
                                        using (var stream = new System.IO.MemoryStream(imgRaw))
                                        using (var img = new System.Drawing.Bitmap(stream))
                                        {
                                            sTd = OCR(img);
                                        }
                                    }
                                }
                                break;
                            // The address text is taken from the first <span> in the sibling cell.
                            case "公司地址":
                                sTd = QueryTexts(th.NextSibling, "span").First();
                                break;
                            default:
                                sTd = th.NextSibling.InnerText.HtmlTrim();
                                break;
                        }
                        attr.Append("{0}:{1}", sTh, sTd);
                    }

                    var bo = new CompanyEntity();
                    bo.City = "上海";
                    bo.GroupName = "漕河泾企业";
                    bo.PageUrl = current.Url.OriginalString;
                    bo.UpdateDate = DateTime.Now;
                    // Maps the page's row labels onto CompanyEntity property names.
                    attr.FillEntity(bo, new Dictionary<string, string>()
                    {
                        {"公司性质", "Nature"},
                        {"公司行业", "Industry"},
                        {"公司规模", "Scale"},
                        {"联系人", "ContactPerson"},
                        {"企业网址", "Website"},
                        {"联系电话", "Tel"},
                        {"邮箱", "Email"},
                        {"公司地址", "Address"},
                    });
                    Repository.SaveCompany(bo);
                    Crawler.OutWrite("保存企业 {0}", bo.Name);
                }
                break;
        }
    }
}
}