Mahout文本聚类学习之DictionaryVectorizer类(3)

  有了词典后就可以对分词后的文档做向量化操作了。它的实现很独特:通过一个循环,分别针对词典的每个分块(chunk)做词频统计。其核心函数为makePartialVectors(),该函数会运行一个MapReduce job。

    // Excerpt of the job configuration. No custom map logic is plugged in: the base Mapper class
    // is used, so records are only read and forwarded to the reduce phase.
    job.setMapperClass(Mapper.class);
    // Input is read through the SequenceFile input format.
    job.setInputFormatClass(SequenceFileInputFormat.class);
    // The per-record work is done by TFPartialVectorReducer.
    job.setReducerClass(TFPartialVectorReducer.class);
    // Output is written through the SequenceFile output format.
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // Number of reduce tasks is supplied by the caller via numReducers.
    job.setNumReduceTasks(numReducers);

  Mapper类只是简单地把tokenized-documents中的序列文件分片后将记录读出,没有做额外的操作;而Reducer任务会对每条记录进行处理,具体的实现在TFPartialVectorReducer类中。

 /**
  * Builds the partial term-frequency vector of one tokenized document against the currently
  * loaded {@code dictionary} and writes it out under the document key.
  *
  * <p>Only the first value of {@code values} is consumed; further values for the same key are
  * ignored. Terms (or n-grams when {@code maxNGramSize >= 2}) that are empty or missing from
  * {@code dictionary} are skipped. A vector without any non-default entry is not written; the
  * {@code emptyVectorCount} counter is incremented instead.
  *
  * @param key     document key; also used as the vector name when {@code namedVector} is set
  * @param values  tokenized document(s) grouped under {@code key}
  * @param context task context used to emit the vector and to update counters
  * @throws IOException          if tokenizing, closing the token stream or writing the output fails
  * @throws InterruptedException if writing the output is interrupted
  */
 @Override
 protected void reduce(Text key, Iterable<StringTuple> values, Context context)
          throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
      return;
    }
    StringTuple value = it.next();
    // One vector is produced per dictionary chunk; a random-access sparse vector is used because
    // the counts below are updated by term id in arbitrary order.
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
      try {
        // Fetch the term attribute once instead of on every token, and only read it after
        // incrementToken() has positioned the stream on a token.
        // NOTE(review): Lucene 4+ also expects sf.reset() before the first incrementToken();
        // confirm against the Lucene version in use.
        CharTermAttribute termAttribute = sf.getAttribute(CharTermAttribute.class);
        while (sf.incrementToken()) {
          String term = termAttribute.toString();
          if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
            int termId = dictionary.get(term);
            vector.setQuick(termId, vector.getQuick(termId) + 1);
          }
        }

        sf.end();
      } finally {
        // Same swallow-on-close behaviour as closeQuietly(), without the deprecated method.
        Closeables.close(sf, true);
      }
    } else {
      for (String term : value.getEntries()) {
        if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
          // From here on the term is represented by its integer dictionary id.
          int termId = dictionary.get(term);
          // Raw per-document term count only; normalization is done in a later stage.
          vector.setQuick(termId, vector.getQuick(termId) + 1);
        }
      }
    }
    if (sequentialAccess) {
      vector = new SequentialAccessSparseVector(vector);
    }
    // When enabled, the vector carries the document key as its name, so readers of the output
    // SequenceFile can recover the title by treating the vector as a NamedVector.
    if (namedVector) {
      vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
      VectorWritable vectorWritable = new VectorWritable(vector);
      context.write(key, vectorWritable);
    } else {
      // Counter group name is kept exactly as it was so existing counter consumers keep working.
      context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
    }
  }

  /**
   * Reads the vectorizer settings from the job configuration and loads the word-to-id dictionary
   * from the first file registered in the DistributedCache.
   *
   * @param context task context that supplies the job configuration
   * @throws IOException              as declared by the overridden method
   * @throws InterruptedException     as declared by the overridden method
   * @throws IllegalArgumentException if no file is registered in the DistributedCache
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration jobConf = context.getConfiguration();

    URI[] cachedUris = DistributedCache.getCacheFiles(jobConf);
    boolean hasDictionaryFile = cachedUris != null && cachedUris.length >= 1;
    Preconditions.checkArgument(hasDictionaryFile, "missing paths from the DistributedCache");

    dimension = jobConf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = jobConf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    // Enable this option to keep the original document title attached to the vector.
    namedVector = jobConf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = jobConf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

    // Copy every word -> id mapping of the dictionary file into the dictionary object.
    Path dictionaryPath = new Path(cachedUris[0].getPath());
    SequenceFileIterable<Writable, IntWritable> dictionaryEntries =
        new SequenceFileIterable<Writable, IntWritable>(dictionaryPath, true, jobConf);
    for (Pair<Writable, IntWritable> entry : dictionaryEntries) {
      String word = entry.getFirst().toString();
      int wordId = entry.getSecond().get();
      dictionary.put(word, wordId);
    }
  }

按词典分块迭代数次之后,会产生几个不同的向量集,它们分别代表了同一文档的不同维度,下一步就要把它们直接合并起来了。

posted @ 2012-09-27 19:12  answer0107  阅读(125)  评论(0)    收藏  举报