将WebGraph合并为HostGraph

LAW实验室提供了很多webgraph,但是没有提供相应的hostgraph。所谓hostgraph,就是将webgraph中属于同一站点的url合并成一个结点。注意到这些webgraph中同一站点的url是连续排列的,这为合并提供了很大的方便。本来想用Java来写,但是考虑到Java的IO效率在Windows下比较差,就改用C#了。我用IKVM将webgraph.jar及其依赖的jar文件打包成了webgraph.dll。

 

合并算法相对比较简单,分为两步:第一步扫描url文件,建立url结点与host结点的对应关系;第二步读取webgraph进行合并,并生成hostgraph。

 

代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using it.unimi.dsi.webgraph;
using org.apache.log4j;
namespace indochina_2004host
{
    /// <summary>
    /// Collapses a WebGraph (one node per URL) into a host graph (one node per host).
    /// </summary>
    /// <remarks>
    /// Works in two passes. <see cref="MergeUrl"/> scans <c>basename.urls</c> and writes
    /// <c>basename.hostnames.txt</c> and <c>basename.map</c>. <see cref="MergeGraph"/> then
    /// reads the map and the compressed graph and writes <c>basename.hostgraph.graph-txt</c>.
    /// Both passes rely on all URLs of one host being stored on consecutive lines of the
    /// <c>.urls</c> file; a host that reappears later is treated as a new host.
    /// </remarks>
    class Program
    {
        // NOTE(review): the published listing had lost the path separators of this verbatim
        // string; they are restored here by inference. Adjust to your local layout, or pass
        // the basename as the first command-line argument instead.
        static string basename = @"D:\研究数据集\law datasets\uk-2007-05\uk-2007-05";
        static Logger logger;
        static bool offline = false;

        // The .map file is machine-readable, so it is always written and parsed culture-independently.
        static readonly System.Globalization.CultureInfo Invariant = System.Globalization.CultureInfo.InvariantCulture;

        // Characters that end the host part of a URL once the scheme has been removed.
        static readonly char[] HostTerminators = { '/', '?', '#' };

        static Program()
        {
            Logger.getRootLogger().addAppender(new ConsoleAppender(new TTCCLayout(), ConsoleAppender.SYSTEM_OUT));
            logger = Logger.getLogger(typeof(Program));
        }

        /// <summary>
        /// Entry point: builds the URL-to-host map, then merges the graph.
        /// </summary>
        /// <param name="args">
        /// Optional. <c>args[0]</c>, when present, replaces the built-in graph basename
        /// (the path of the graph files without extension).
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length > 0)
            {
                basename = args[0];
            }

            MergeUrl();
            MergeGraph();
        }

        /// <summary>
        /// Reads <c>basename.map</c> and the graph, and writes the host graph as text:
        /// the first line is the number of hosts, followed by one line per host holding the
        /// ascending, space-separated ids of its successor hosts (self-loops removed).
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The map file is malformed, its ranges are not contiguous and in order, or it does
        /// not cover exactly the nodes of the graph.
        /// </exception>
        static void MergeGraph()
        {
            logger.info("start merge graph");
            string[] maps = File.ReadAllLines(basename + ".map");
            BVGraph bg;
            logger.info("loading graph");
            if (offline)
            {
                bg = BVGraph.loadOffline(basename);
            }
            else
            {
                bg = BVGraph.load(basename);
            }

            // Build the node -> host lookup before opening the output file, so that a bad
            // map file does not leave a truncated host graph behind.
            logger.info("generate map");
            int[] map = new int[bg.numNodes()];
            int[] start = new int[maps.Length];
            int[] end = new int[maps.Length];
            int nextNode = 0;
            for (int k = 0; k < maps.Length; k++)
            {
                // Each line has the form "<first>-<last> <hostId>".
                string[] parts = maps[k].Split(' ', '-');
                if (parts.Length != 3)
                {
                    throw new InvalidDataException("malformed line " + (k + 1) + " in " + basename + ".map: " + maps[k]);
                }

                int st = int.Parse(parts[0], Invariant);
                int en = int.Parse(parts[1], Invariant);
                int value = int.Parse(parts[2], Invariant);

                // The merge loop below walks the node iterator sequentially and removes
                // self-loops by line index, so ranges must be contiguous, in order, inside
                // the graph, and numbered by their position in the file.
                if (st != nextNode || en < st || en >= map.Length || value != k)
                {
                    throw new InvalidDataException("inconsistent line " + (k + 1) + " in " + basename + ".map: " + maps[k]);
                }

                start[k] = st;
                end[k] = en;
                for (int i = st; i <= en; i++)
                {
                    map[i] = value;
                }

                nextNode = en + 1;
            }

            if (nextNode != map.Length)
            {
                throw new InvalidDataException(basename + ".map covers " + nextNode + " nodes but the graph has " + map.Length);
            }

            using (StreamWriter sw = new StreamWriter(basename + ".hostgraph.graph-txt"))
            {
                NodeIterator it = bg.nodeIterator();
                logger.info("map length: " + start.Length);
                sw.WriteLine(start.Length);

                // Start merging.
                logger.info("merging...");
                for (int k = 0; k < start.Length; k++)
                {
                    logger.info("merging " + start[k] + "-" + end[k] + ": " + map[start[k]]);

                    // Successor pages are mapped to their host on insertion; the sorted set
                    // both de-duplicates them and yields the ascending order written below.
                    SortedSet<int> hostSuccessors = new SortedSet<int>();
                    for (int i = start[k]; i <= end[k]; i++)
                    {
                        it.nextInt();
                        LazyIntIterator lit = it.successors();
                        int j;
                        while ((j = lit.nextInt()) != -1)
                        {
                            hostSuccessors.Add(map[j]);
                        }
                    }

                    // Links between pages of the same host are not arcs of the host graph.
                    hostSuccessors.Remove(k);
                    sw.WriteLine(string.Join<int>(" ", hostSuccessors));
                }
            }
            logger.info("end merge graph");
        }

        /// <summary>
        /// Scans <c>basename.urls</c> (one URL per line, line number = node id) and groups
        /// consecutive URLs sharing a host. Writes one host name per line to
        /// <c>basename.hostnames.txt</c> and one <c>first-last hostId</c> line per host to
        /// <c>basename.map</c>. An empty URL file produces two empty output files.
        /// </summary>
        static void MergeUrl()
        {
            logger.info("start merge url");
            using (StreamReader urlSr = new StreamReader(basename + ".urls"))
            using (StreamWriter hostnamesSw = new StreamWriter(basename + ".hostnames.txt"))
            using (StreamWriter mapSw = new StreamWriter(basename + ".map"))
            {
                long node = 0;        // id of the URL currently being read
                long rangeStart = 0;  // first node id of the current host
                long hostId = 0;      // id of the current host
                string host = null;   // current host; null until the first URL is read
                string url;
                while ((url = urlSr.ReadLine()) != null)
                {
                    // Every line, even a blank one, is a node and must keep its id, so no
                    // line is ever skipped; a blank line simply becomes the host "".
                    string curhost = ExtractHost(url);
                    if (host != null && host != curhost)
                    {
                        WriteHost(hostnamesSw, mapSw, host, rangeStart, node - 1, hostId);
                        hostId++;
                        rangeStart = node;
                    }

                    host = curhost;
                    node++;
                }

                // Emit the last host; nothing to emit when the URL file was empty.
                if (host != null)
                {
                    WriteHost(hostnamesSw, mapSw, host, rangeStart, node - 1, hostId);
                }
            }
            logger.info("end merge url");
        }

        /// <summary>
        /// Returns the lower-cased host part of <paramref name="url"/>: a leading
        /// <c>http://</c> or <c>https://</c> is removed and the rest is cut at the first
        /// '/', '?' or '#'. A URL without any of those characters is returned whole.
        /// </summary>
        static string ExtractHost(string url)
        {
            // Invariant lower-casing keeps host names stable regardless of the OS locale.
            string rest = url.Trim().ToLowerInvariant();
            if (rest.StartsWith("http://", StringComparison.Ordinal))
            {
                rest = rest.Substring("http://".Length);
            }
            else if (rest.StartsWith("https://", StringComparison.Ordinal))
            {
                rest = rest.Substring("https://".Length);
            }

            int cut = rest.IndexOfAny(HostTerminators);
            if (cut >= 0)
            {
                rest = rest.Substring(0, cut);
            }

            return rest.Trim();
        }

        /// <summary>
        /// Appends one host to the host-name list and its node range to the map file.
        /// </summary>
        static void WriteHost(TextWriter hostnames, TextWriter map, string host, long first, long last, long hostId)
        {
            hostnames.WriteLine(host);
            map.WriteLine(string.Format(Invariant, "{0}-{1} {2}", first, last, hostId));
        }
    }
}

posted on 2010-04-27 08:29  小橋流水  阅读(307)  评论(0编辑  收藏  举报

导航