Mahout文本聚类学习之DictionaryVectorizer类(3)
有了词典后就可以对分词后的文档做向量化操作了。它的实现很独特:通过一个循环分别对词典的每个 chunk(分块)做统计。其核心函数为 makePartialVectors(),通过这个函数来运行一个 MapReduce job。
// Input side: records are read from sequence files and passed through the
// stock Mapper class, i.e. no custom map logic is configured here.
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(Mapper.class);

// Reduce side: the per-document work is done by TFPartialVectorReducer,
// run with the configured number of reduce tasks.
job.setReducerClass(TFPartialVectorReducer.class);
job.setNumReduceTasks(numReducers);

// Output side: results are written back out as sequence files.
job.setOutputFormatClass(SequenceFileOutputFormat.class);
Mapper 类只是简单地把 tokenized-documents 中的序列文件分片后将记录读出,没有做额外的操作;而 Reducer 任务会对每条记录进行处理,具体的实现在 TFPartialVectorReducer 类中。
protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return; } StringTuple value = it.next(); //针对于每一个dictionary trunk会生成一个向量,这个向量的特点是可以快速的进行随机访问 Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); sf.end(); } finally { Closeables.closeQuietly(sf); } } else { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram //在这里Text对象变成了int int termId = dictionary.get(term); //对每个文档中的词频进行统计正规化在下一个阶段中完成 vector.setQuick(termId, vector.getQuick(termId) + 1); } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } //如果要设置这个变量的话在读写输出的SeqenceFile时就应该将vector向NameVector转化得到标题 if (namedVector) { vector = new NamedVector(vector, key.toString()); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. 
if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(vector); context.write(key, vectorWritable); } else { context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1); } } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); URI[] localFiles = DistributedCache.getCacheFiles(conf); Preconditions.checkArgument(localFiles != null && localFiles.length >= 1, "missing paths from the DistributedCache"); dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE); sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false); //假如要保留文本原来的标题就要设置这个选项 namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false); maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize); Path dictionaryFile = new Path(localFiles[0].getPath()); // 把词典中的word到id的映射加到dictionary对象中来 for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) { dictionary.put(record.getFirst().toString(), record.getSecond().get()); } }
对词典的每个分块各迭代一次之后,会产生若干个部分向量集,它们分别覆盖同一文档在不同词典分块上的维度,下一步就要把它们合并起来了。

浙公网安备 33010602011771号