Googler

两情相悦,又岂在朝朝暮暮。

realestate.cei.gov.cn

using AnfleCrawler.Common;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the trade pages under
    /// <c>http://www.realestate.cei.gov.cn/traden/br2.aspx</c>.
    /// <see cref="Init"/> reads the <c>#qrq</c> option list of a seed page and pushes one
    /// depth-1 URL per option value; <see cref="AnalyzeInternal"/> collects the table rows
    /// found under <c>#str1</c> on each of those pages; and the rows are written to
    /// <c>D:\outdict.txt</c> whenever the lander raises its <c>Idle</c> event.
    /// </summary>
    public class ManualAnalyzer : AnalyzerBase
    {
        /// <summary>URL template of a report page; <c>{0}</c> is the <c>rq</c> value (yyyyMMdd).</summary>
        private const string PageUrlFormat = "http://www.realestate.cei.gov.cn/traden/br2.aspx?rq={0}&lx=w6&r1=20140830";

        /// <summary><c>rq</c> value of the page used to discover the available option values.</summary>
        private const string SeedDate = "20140601";

        /// <summary>Format of the <c>rq</c> query-string parameter.</summary>
        private const string QueryDateFormat = "yyyyMMdd";

        /// <summary>File that receives the collected rows.</summary>
        private const string OutputPath = @"D:\outdict.txt";

        /// <summary>
        /// Collected rows. Each entry is one output line: the page date followed by the cell
        /// texts. A single-element entry holding <see cref="Environment.NewLine"/> separates pages.
        /// </summary>
        private readonly ConcurrentQueue<string[]> _dict = new ConcurrentQueue<string[]>();

        /// <summary>
        /// Subscribes to the lander's <c>Idle</c> event, initializes the base analyzer, then
        /// downloads the seed page and pushes a depth-1 URL for every non-empty
        /// <c>#qrq option</c> value.
        /// </summary>
        /// <param name="crawler">The crawler this analyzer is attached to.</param>
        public override void Init(PageCrawler crawler)
        {
            crawler.Lander.Idle += Lander_Idle;
            base.Init(crawler);

            var url = new Uri(string.Format(PageUrlFormat, SeedDate));
            var dom = Crawler.Lander.GetDocument(new PageContentHandler() { Url = url });
            foreach (var node in QueryNodes(dom.DocumentNode, "#qrq option"))
            {
                string val = node.GetAttributeValue("value", string.Empty);
                if (string.IsNullOrWhiteSpace(val))
                {
                    // An option without a value would yield a URL whose "rq" cannot be parsed.
                    continue;
                }
                Crawler.PushUrl(new Uri(string.Format(PageUrlFormat, val)), 1);
            }
        }

        /// <summary>
        /// Handler for the lander's <c>Idle</c> event: writes every collected row to
        /// <c>D:\outdict.txt</c> as a comma-joined UTF-8 line, overwriting the file.
        /// </summary>
        private void Lander_Idle(object sender, EventArgs e)
        {
            Crawler.OutWrite("Start step2...");
            App.LogInfo("Start step2...");
            using (var writer = new System.IO.StreamWriter(OutputPath, false, Encoding.UTF8))
            {
                foreach (var set in _dict)
                {
                    writer.WriteLine(string.Join(",", set));
                }
            }
        }

        /// <summary>
        /// For depth-1 pages: reads the date from the <c>rq</c> query parameter, loads the page,
        /// repairs the unclosed table rows under <c>#str1</c> and enqueues one entry per row
        /// (date first, then the <c>td</c> texts). Pages with a missing or invalid <c>rq</c>,
        /// without a <c>#str1</c> element, or with blank <c>#str1</c> text are skipped.
        /// </summary>
        /// <param name="current">The landed page to analyze.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            Crawler.OutWrite("*Start step1...");
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 1:
                    {
                        var query = System.Web.HttpUtility.ParseQueryString(current.Url.Query);
                        DateTime dt;
                        // Invariant culture: "rq" is machine-readable and must not depend on the
                        // host's calendar or culture settings.
                        if (!DateTime.TryParseExact(query["rq"], QueryDateFormat,
                                System.Globalization.CultureInfo.InvariantCulture,
                                System.Globalization.DateTimeStyles.None, out dt))
                        {
                            App.LogInfo("Skip page, missing or invalid rq: {0}", current.Url.ToString());
                            return;
                        }
                        var dom = lander.GetDocument(pHandler);

                        var checkNode = QueryNode(dom.DocumentNode, "#str1");
                        if (checkNode == null || string.IsNullOrWhiteSpace(checkNode.InnerText))
                        {
                            return;
                        }
                        checkNode.InnerHtml = CloseTableRows(checkNode.InnerHtml);
                        App.LogInfo("WTF CN:{0}", checkNode.InnerHtml);

                        string dateText = dt.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
                        // Count while iterating so the row query is enumerated only once.
                        int rowCount = 0;
                        foreach (var node in QueryNodes(checkNode, "tr"))
                        {
                            var row = new List<string>();
                            row.Add(dateText);
                            row.AddRange(QueryTexts(node, "td"));
                            _dict.Enqueue(row.ToArray());
                            rowCount++;
                        }
                        // Page separator in the output file.
                        _dict.Enqueue(new string[] { Environment.NewLine });
                        Crawler.OutWrite("#Stop step1 {0} {1}", dt.ToShortDateString(), rowCount);
                    }
                    break;
            }
        }

        /// <summary>
        /// Inserts a closing <c>&lt;/tr&gt;</c> before every <c>&lt;tr</c> except the first one,
        /// so that rows left unclosed in the markup are terminated before the next row starts.
        /// Markup that contains no <c>&lt;tr</c> is returned unchanged.
        /// </summary>
        /// <param name="html">Inner HTML of the table container.</param>
        /// <returns>The HTML with a <c>&lt;/tr&gt;</c> placed before the second and later rows.</returns>
        private static string CloseTableRows(string html)
        {
            const string rowOpen = "<tr";
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }
            int first = html.IndexOf(rowOpen, StringComparison.Ordinal);
            if (first < 0)
            {
                return html;
            }
            // Leave everything up to and including the first "<tr" untouched; only later rows
            // need a terminator for the row that precedes them.
            int splitAt = first + rowOpen.Length;
            return html.Substring(0, splitAt) + html.Substring(splitAt).Replace(rowOpen, "</tr>" + rowOpen);
        }
    }
}

 

posted on 2014-11-20 16:51  RockyLOMO  阅读(511)  评论(0)  编辑  收藏  举报

导航

Apple/苹果笔记本 Mac Air MC968CH/A 行货在保 I5 11寸 超级本