1. 首先将停用词表(stopWord.txt)逐行读入 ArrayList 中:
代码
/**
 * Loads the stop-word list from {@code stopWord.txt}, resolved against the
 * current working directory.
 *
 * <p>Every line of the file becomes one list entry, unmodified (no trimming,
 * no de-duplication). Reading is best-effort: if the file is missing or a read
 * fails, the stack trace is printed and the entries read so far are returned.
 *
 * @return the stop words in file order; never {@code null}, possibly empty
 */
public ArrayList<String> stopword() {
    ArrayList<String> stopWords = new ArrayList<String>();
    // try-with-resources guarantees the reader is closed even when readLine() throws.
    try (BufferedReader reader =
            new BufferedReader(new FileReader(new File("stopWord.txt")))) {
        String line;
        while ((line = reader.readLine()) != null) {
            stopWords.add(line);
        }
    } catch (Exception ex) {
        // Deliberately non-fatal: callers continue with a partial or empty list.
        ex.printStackTrace();
    }
    return stopWords;
}
2. 去掉文章中的停用词,剩下的即为待统计 TF-IDF 的词项(term):
代码
/**
 * Reads a document and returns its tokens with stop words removed.
 *
 * <p>Each line of the file is treated as one token. A token is kept when it
 * does not exactly match any entry returned by {@link #stopword()}. Order and
 * duplicates of the remaining tokens are preserved. Reading is best-effort: on
 * an I/O failure the stack trace is printed and the tokens collected so far
 * are returned.
 *
 * @param filename path of the document to read
 * @return the non-stop-word tokens in file order; never {@code null}
 */
public ArrayList<String> delete(String filename) {
    ArrayList<String> terms = new ArrayList<String>();
    // Hash-based lookup: the stop-word test runs once per input line, so a
    // linear scan of the list would make the whole pass quadratic.
    java.util.Set<String> stopWords = new java.util.HashSet<String>(stopword());
    // try-with-resources guarantees the reader is closed even when readLine() throws.
    try (BufferedReader reader =
            new BufferedReader(new FileReader(new File(filename)))) {
        String token;
        while ((token = reader.readLine()) != null) {
            if (!stopWords.contains(token)) {
                terms.add(token);
            }
        }
    } catch (Exception ex) {
        // Deliberately non-fatal: callers continue with a partial or empty list.
        ex.printStackTrace();
    }
    return terms;
}
3. 计算每个词的 TF-IDF 值,并返回得分最高的词:
代码
/**
 * Scores every distinct term of a document with TF-IDF and returns the
 * highest-scoring terms.
 *
 * <p>Term frequency is the term's occurrence count divided by the total number
 * of tokens in {@code filList}; inverse document frequency comes from
 * {@link #showidf(String)}; the score is their product. The result holds every
 * term whose score equals one of the two largest entries of the sorted score
 * list, so tied terms are all included and the map may contain more than two
 * entries. When the document has fewer than two distinct terms, all of them
 * are returned.
 *
 * @param filList the document's tokens, typically already stop-word filtered
 * @return map from top-scoring term to its TF-IDF score; empty for an empty
 *         document, never {@code null}
 */
public HashMap<String, Double> showtfidf(ArrayList<String> filList) {
    final int topCount = 2;

    // Count how often each term occurs in the document.
    HashMap<String, Integer> termCounts = new HashMap<String, Integer>();
    for (String term : filList) {
        Integer seen = termCounts.get(term);
        termCounts.put(term, seen == null ? 1 : seen + 1);
    }

    HashMap<String, Double> scores = new HashMap<String, Double>();
    ArrayList<Double> sortedScores = new ArrayList<Double>();
    for (String term : termCounts.keySet()) {
        double tf = (double) termCounts.get(term) / filList.size();
        double idf = showidf(term);
        // TF-IDF is the product of the two factors, not their sum.
        double tfidf = tf * idf;
        scores.put(term, tfidf);
        sortedScores.add(tfidf);
    }

    // Ascending sort, so the best scores sit at the tail of the list.
    Collections.sort(sortedScores);
    // Clamp the start index so documents with fewer than topCount distinct
    // terms (including empty ones) do not index below zero.
    int firstTopIndex = Math.max(0, sortedScores.size() - topCount);
    ArrayList<Double> topScores = new ArrayList<Double>(
            sortedScores.subList(firstTopIndex, sortedScores.size()));

    HashMap<String, Double> topTerms = new HashMap<String, Double>();
    for (String term : scores.keySet()) {
        Double score = scores.get(term);
        if (topScores.contains(score)) {
            topTerms.put(term, score);
        }
    }
    return topTerms;
}
/**
 * Computes an inverse-document-frequency ratio for a term.
 *
 * <p>The value is the number of lists returned by {@code Filter.showall()}
 * divided by how many of those lists contain {@code d}. No logarithm is
 * applied. A term found in none of the lists yields 0.
 *
 * <p>NOTE(review): presumably each list holds the terms of one corpus
 * document; confirm against {@code Filter}.
 *
 * @param d the term to look up
 * @return total list count divided by matching list count, or 0 when nothing
 *         matches
 */
public double showidf(String d) {
    ArrayList<ArrayList> corpus = new Filter().showall();

    // Tally the lists that mention the term.
    int matches = 0;
    for (ArrayList document : corpus) {
        if (document.contains(d)) {
            matches++;
        }
    }

    if (matches == 0) {
        return 0;
    }
    return (double) corpus.size() / matches;
}
待续--

浙公网安备 33010602011771号