最新IP地址数据库 二分逼近&二分查找 高效解析800万大数据之区域分布

最新IP地址数据库  来自 qqzeng.com 

利用二分逼近(bisection method) ,每秒500多万, 比较高效!

多语言API解析Dat 导入数据库脚本
https://github.com/zengzhan/qqzeng-ip


 

原来的顺序查找算法 效率比较低

 readonly string ipBinaryFilePath = "qqzengipdb.dat";
        readonly byte[] dataBuffer, indexBuffer;
        readonly uint[] index = new uint[256];
        readonly int dataLength;
        public IpLocation()
        {
            try
            {
                FileInfo file = new FileInfo(ipBinaryFilePath);
                dataBuffer = new byte[file.Length];
                using (var fin = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
                {
                    fin.Read(dataBuffer, 0, dataBuffer.Length);
                }                
                var offset_len = BytesToLong(dataBuffer[0], dataBuffer[1], dataBuffer[2], dataBuffer[3]); //Big Endian
                indexBuffer = new byte[offset_len];
                Array.Copy(dataBuffer, 4, indexBuffer, 0, offset_len);
                dataLength = (int)offset_len;
                for (int loop = 0; loop < 256; loop++)
                {
                    //索引 四字节  LITTLE_ENDIAN
                    index[loop] = BytesToLong(indexBuffer[loop * 4 + 3], indexBuffer[loop * 4 + 2], indexBuffer[loop * 4 + 1], indexBuffer[loop * 4]);
                }
            }
            catch { }
        }

        public string[] Find(string ip)
        {
            var ips = ip.Split('.');
            uint ip_prefix = uint.Parse(ips[0]);
            uint find_uint32 = BytesToLong(byte.Parse(ips[0]), byte.Parse(ips[1]), byte.Parse(ips[2]), byte.Parse(ips[3]));//BIG_ENDIAN
          
            // LITTLE_ENDIAN
            int max_len = 0;       
            int resultOffset =-1;
            int resultLegth =-1;
            
            uint start = index[ip_prefix] * 8 + 1024;
            if (ip_prefix != 255)
            {
                max_len = (int)index[ip_prefix + 1] * 8 + 1024;
            }
            else
            {
                max_len = (int)index[255] * 8 + 1024+1;
            }
   
            for (; start < max_len; start += 8)
            {
                // 前四位 结束ip   后三位 偏移  最后一位 内容长度 
                uint endipNum = BytesToLong(indexBuffer[start + 0], indexBuffer[start + 1], indexBuffer[start + 2], indexBuffer[start + 3]);//BIG_ENDIAN
                if (endipNum >= find_uint32)
                {
                    resultOffset =(int) BytesToLong((byte)0, indexBuffer[start + 6], indexBuffer[start + 5], indexBuffer[start + 4]);//LITTLE_ENDIAN
                    resultLegth = 0xFF & indexBuffer[start + 7];// 长度
                    break;
                }             
            }
            if (resultOffset==-1||resultLegth==-1)
            {
                return new string[] {"N/A"};
            }
            var areaBytes = new byte[resultLegth];
            Array.Copy(dataBuffer, dataLength + resultOffset - 1024, areaBytes, 0, resultLegth);
            return Encoding.UTF8.GetString(areaBytes).Split(' ');
        }

     
        private static uint BytesToLong(byte a, byte b, byte c, byte d)
        {
           
            return ((uint)a << 24) | ((uint)b << 16) | ((uint)c << 8) | (uint)d;
        }


        public static string long2IP(long longIP)
        {
            StringBuilder sb = new StringBuilder("");
            sb.Append(longIP >> 24);
            sb.Append(".");
            //将高8位置0,然后右移16为

            sb.Append((longIP & 0x00FFFFFF) >> 16);
            sb.Append(".");

            sb.Append((longIP & 0x0000FFFF) >> 8);
            sb.Append(".");
            sb.Append((longIP & 0x000000FF));

            return sb.ToString();

        }
  
    }

改进版 采用二分逼近 算法(类似二分查找,但又不同)  性能提升很大

  public string[] Find(string ip)
        {
            var ips = ip.Split('.');
            uint ip_prefix = uint.Parse(ips[0]);
            uint find_uint32 = BytesToLong(byte.Parse(ips[0]), byte.Parse(ips[1]), byte.Parse(ips[2]), byte.Parse(ips[3]));//BIG_ENDIAN
            uint max_len = 0;
            int resultOffset = -1;
            int resultLegth = -1;
            uint start = index[ip_prefix];
            if (ip_prefix != 255)
            {
                max_len = index[ip_prefix + 1];
            }
            else
            {
                max_len = index[255];
            }
            uint num = max_len - start;
            uint my_index = BinarySearch(start, max_len, find_uint32);
            start = my_index * 8 + 1024;
            resultOffset = (int)BytesToLong((byte)0, indexBuffer[start + 6], indexBuffer[start + 5], indexBuffer[start + 4]);//LITTLE_ENDIAN
            resultLegth = 0xFF & indexBuffer[start + 7];// 长度

            if (resultOffset == -1 || resultLegth == -1)
            {
                return new string[] { "N/A" };
            }
            var areaBytes = new byte[resultLegth];
            Array.Copy(dataBuffer, dataLength + resultOffset - 1024, areaBytes, 0, resultLegth);
            return Encoding.UTF8.GetString(areaBytes).Split(' ');
        }



        /// <summary>
        /// 二分逼近
        /// </summary>
        public uint BinarySearch(uint low, uint high, uint k)
        {
            uint M = 0;
            while (low <= high)
            {
                uint mid = (low + high) / 2;
                uint endipNum = GetStartIp(mid);
                if (endipNum >= k)
                {
                    M = mid; //mid有可能是解
                    high = mid - 1;
                }
                else
                    low = mid + 1;
            }
            return M;
        }

有了上面高效算法  解析出来800多万数据 也很快   再用一个简单的ling 统计一下即可

  var cn_result= from r in list
                        group r by r.cn into g
                        select new { key = g.Key, cnt = g.Count() };

 800多万数据  统计组图  

 

 

 

 

 

最新IP地址数据库 qqzeng-ip

        微信公众号:qqzeng-ip

 

    IP地址数据库: IP地址数据库

 

 

posted @ 2014-05-26 22:15  曾祥展  阅读(11178)  评论(10编辑  收藏  举报