敏感词判断二:敏感词任意排列组合都过滤(scala版本)
/**
 * Reports whether the sentence contains at least one complete drop phrase.
 *
 * A drop phrase matches when every one of its tokens is present in `set1`
 * (the set of substrings of the sentence). Because the same substring set is
 * consulted at every level of the tree, the order in which the tokens appear
 * in the sentence does not matter.
 *
 * @param set1 substrings of the sentence, as produced by `uptoNGramSeg`
 * @param map1 drop-phrase tree, as produced by `toSetTree`
 * @return true if a sensitive phrase is contained, false otherwise
 */
def testFunction(set1: Set[String], map1: Map[String, Any]): Boolean =
  // `exists` stops at the first matching branch, so a later non-matching
  // branch can never overwrite an earlier hit.
  map1.exists {
    case (token, child: Map[String, Any] @unchecked) if set1.contains(token) =>
      // The "" key marks the end of a phrase; a node may be both the end of
      // one phrase and the prefix of a longer one.
      child.contains("") || testFunction(set1, child)
    case _ =>
      // Either the token is absent from the sentence or this entry is the
      // end-of-phrase marker itself ("" -> ""), which is not a subtree.
      false
  }

/**
 * Splits a sentence into all of its contiguous substrings of length 1 up to
 * `senLength`, after removing space characters.
 *
 * Substrings are taken over UTF-16 chars. A null sentence yields an empty
 * set. A `senLength` below 1 still yields the single-character substrings.
 *
 * @param sentence  text to segment
 * @param senLength maximum substring length
 * @return the set of substrings
 */
def uptoNGramSeg(sentence: String, senLength: Int): Set[String] = {
  val text = Option(sentence).fold("")(_.replace(" ", ""))
  if (text.isEmpty) {
    Set.empty[String]
  } else {
    // A window longer than the text would only repeat the whole text, so the
    // window size is capped at the text length.
    val maxLen = math.max(1, math.min(senLength, text.length))
    (1 to maxLen).flatMap(n => text.sliding(n)).toSet
  }
}

/**
 * Builds a prefix tree from tokenised drop phrases.
 *
 * Each key is a token and each value is the subtree of the tokens that may
 * follow it. The entry `"" -> ""` marks a node at which a phrase ends, so a
 * one-token phrase `a` becomes `Map(a -> Map("" -> ""))`. Empty tokens inside
 * a phrase are skipped.
 *
 * @param keyWordList drop phrases, each given as its list of tokens
 * @return the tree as nested maps
 */
def toSetTree(keyWordList: List[List[String]]): Map[String, Any] = {
  // Strip empty tokens at the front so that "" is only ever used as the
  // end-of-phrase marker, then separate the phrases that end at this node.
  val (ended, continuing) =
    keyWordList.map(_.dropWhile(_.isEmpty)).partition(_.isEmpty)
  val children: Map[String, Any] =
    continuing.groupBy(_.head).map { case (token, phrases) =>
      token -> toSetTree(phrases.map(_.tail))
    }
  if (ended.nonEmpty) children + ("" -> "") else children
}

// ---- Demo ----
val mottoWord = "骂人风波胡歌"
val mottoWord2 = "草木皆兵"

val mottoWordSet = uptoNGramSeg(mottoWord, mottoWord.length)
println(s"==mottoWordSet: $mottoWordSet")
// e.g. Set(波胡, 人风, 歌, 波, 骂人风波胡, 骂人风波, 骂, 人风波胡, 胡歌, 人风波胡歌, 风波胡, 骂人风, 风波, 人风波, 人, 风波胡歌, 风, 骂人风波胡歌, 波胡歌, 骂人, 胡)

val dropWordList = List("胡歌 陷 门", "胡歌 骂人", "笨蛋", "草").map(x => x.split(" ").toList)
val dropWordMap = toSetTree(dropWordList)
println(s"==dropWordMap: $dropWordMap")
// Map(胡歌 -> Map(骂人 -> Map( -> ), 陷 -> Map(门 -> Map( -> ))), 笨蛋 -> Map( -> ), 草 -> Map( -> ))

val isflag = testFunction(mottoWordSet, dropWordMap)
println(s"isflag: $isflag") // true