java使用Nagao算法实现新词发现、热门词的挖掘-eolink官网

java使用Nagao算法实现新词发现、热门词的挖掘

采用Nagao算法统计各个子字符串的频次，然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息（内部凝聚度）。

名词解释：

Nagao算法：一种快速的统计文本里所有子字符串频次的算法。详细算法可见http://doc88.com/p-664123446503.html

词频：该字符串在文档中出现的次数。出现次数越多越重要。

左右邻个数：文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多，说明字符串成词概率越高。

左右熵：文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标，有一定区别。

交互信息：每次将某字符串分成两部分，左半部分字符串和右半部分字符串，计算其同时出现的概率除以其各自独立出现的概率之积，最后取所有划分中该比值的最小值。这个值越大，说明字符串内部凝聚度越高，越可能成词。

算法具体流程：

1. 将输入文件逐行读入，按照非汉字（[^\u4E00-\u9FA5]+）以及停用词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”，

分成一个个字符串，代码如下：

String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");

停用词可以修改。

2. 获取所有切分后的字符串的左子串和右子串，分别加入左、右PTable

3. 对PTable排序，并计算LTable。LTable记录的是，排序后的PTable中，下一个子串同上一个子串具有相同字符的数量

4. 遍历PTable和LTable，即可得到所有子字符串的词频、左右邻

5. 根据所有子字符串的词频、左右邻结果，输出字符串的词频、左右邻个数、左右熵、交互信息

1. NagaoAlgorithm.java

package com.algo.word;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collections;

import java.util.HashMap;

import java.util.HashSet;

import java.util.List;

import java.util.Map;

import java.util.Set;

public class NagaoAlgorithm {

private int N;

private List leftPTable;

private int[] leftLTable;

private List rightPTable;

private int[] rightLTable;

private double wordNumber;

private Map wordTFNeighbor;

private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";

private NagaoAlgorithm(){

//default N = 5

N = 5;

leftPTable = new ArrayList();

rightPTable = new ArrayList();

wordTFNeighbor = new Hhttp://ashMap();

}

//reverse phrase

private String reverse(String phrase) {

StringBuilder reversePhrase = new StringBuilder();

for (int i = phrase.length() - 1; i >= 0; i--)

reversePhrase.append(phrase.charAt(i));

return reversePhrase.toString();

}

//co-prefix length of s1 and s2

private int coPrefixLength(String s1, String s2){

int coPrefixLength = 0;

for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){

if(s1.charAt(i) == s2.charAt(i)) coPrefixLength++;

else break;

}

return coPrefixLength;

}

//add substring of line to pTable

private void addToPTable(String line){

//split line according to consecutive none Chinese character

String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");

for(String phrase : phrases){

for(int i = 0; i < phrase.length(); i++)

rightPTable.add(phrase.substring(i));

String reversePhrase = reverse(phrase);

for(int i = 0; i < reversePhrase.length(); i++)

leftPTable.add(reversePhrase.substring(i));

wordNumber += phrase.length();

}

//count lTable

private void countLTable(){

Collections.sort(rightPTable);

rightLTable = new int[rightPTable.size()];

for(int i = 1; i < rightPTable.size(); i++)

rightLTable[i] = coPrefixLength(rightPTable.get(i-1), rightPTable.get(i));

Collections.sort(leftPTable);

leftLTable = new int[leftPTable.size()];

for(int i = 1; i < leftPTable.size(); i++)

leftLTable[i] = coPrefixLength(leftPTable.get(i-1), leftPTable.get(i));

System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");

}

//according to pTable and lTable, count statistical result: TF, neighbor distribution

private void countTFNeighbor(){

//get TF and right neighbor

for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){

String phrase = rightPTable.get(pIndex);

for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){

String word = phrase.substring(0, length);

TFNeighbor tfNeighbor = new TFNeighbor();

tfNeighbor.incrementTF();

if(phrase.length() > length)

tfNeighbor.addToRightNeighbor(phrase.charAt(length));

for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){

if(rightLTable[lIndex] >= length){

tfNeighbor.incrementTF();

String coPhrase = rightPTable.get(lIndex);

if(coPhrase.length() > length)

tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));

}

else break;

}

wordTFNeighbor.put(word, tfNeighbor);

}

//get left neighbor

for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){

String phrase = leftPTable.get(pIndex);

for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){

String word = reverse(phrase.substring(0, length));

TFNeighbor tfNeighbor = wordTFNeighbor.get(word);

if(phrase.length() > length)

tfNeighbor.addToLeftNeighbor(phrase.charAt(length));

for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){

if(leftLTable[lIndex] >= length){

String coPhrase = leftPTable.get(lIndex);

if(coPhrase.length() > length)

tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));

}

else break;

}

System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");

}

//according to wordTFNeighbor, count MI of word

private double countMI(String word){

if(word.length() <= 1) return 0;

double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber;

List mi = new ArrayList(word.length());

for(int pos = 1; pos < word.length(); pos++){

String leftPart = word.substring(0, pos);

String rightPart = word.substring(pos);

double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber;

double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber;

mi.add(coProbability/(leftProbability*rightProbability));

}

return Collections.min(mi);

}

//save TF, (left and right) neighbor number, neighbor entropy, mutual information

private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){

try {

//read stop words file

Set stopWords = new HashSet();

BufferedReader br = new BufferedReader(new FileReader(stopList));

String line;

while((line = br.readLine()) != null){

if(line.length() > 1)

stopWords.add(line);

}

br.close();

//output words TF, neighbor info, MI

BufferedWriter bw = new BufferedWriter(new FileWriter(out));

for(Map.Enthttp://ry entry : wordTFNeighbor.entrySet()){

if( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue;

TFNeighbor tfNeighbor = entry.getValue();

int tf, leftNeighborNumber, rightNeighborNumber;

double mi;

tf = tfNeighbor.getTF();

leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();

rightNeighborNumber = tfNeighbor.getRightNeighborNumber();

mi = countMI(entry.getKey());

if(tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) &&

rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3]) ){

StringBuilder sb = new StringBuilder();

sb.append(entry.getKey());

NyacmJuo sb.append(",").append(tf);

sb.append(",").append(leftNeighborNumber);

sb.append(",").append(rightNeighborNumber);

sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());

sb.append(",").append(tfNeighbor.getRightNeighborEntropy());

sb.append(",").append(mi).append("\n");

bw.write(sb.toString());

}

bw.close();

} catch (IOException e) {

throw new RuntimeException(e);

}

System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");

}

//apply nagao algorithm to input file

public static void applyNagao(String[] inputs, String out, String stopList){

NagaoAlgorithm nagao = new NagaoAlgorithm();

//step 1: add phrases to PTable

String line;

for(String in : inputs){

try {

BufferedReader br = new http://BufferedReader(new FileReader(in));

while((line = br.readLine()) != null){

nagao.addToPTable(line);

}

br.close();

} catch (IOException e) {

throw new RuntimeException();

}

System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");

//step 2: sort PTable and count LTable

nagao.countLTable();

//step3: count TF and Neighbor

nagao.countTFNeighbor();

//step4: save TF NeighborInfo and MI

nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(","));

}

public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){

NagaoAlgorithm nagao = new NagaoAlgorithm();

nagao.setN(n);

String[] threshold = filter.split(",");

if(threshold.length != 4){

System.out.println("ERROR: filter must have 4 numbers, seperated with ',' ");

return;

}

//step 1: add phrases to PTable

String line;

for(String in : inputs){

try {

BufferedReader br = new BufferedReader(new FileReader(in));

while((line = br.readLine()) != null){

nagao.addToPTable(line);

}

br.close();

} catch (IOException e) {

throw new RuntimeException();

}

System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");

//step 2: sort PTable and count LTable

nagao.countLTable();

//step3: count TF and Neighbor

nagao.countTFNeighbor();

//step4: save TF NeighborInfo and MI

nagao.saveTFNeighborInfoMI(out, stopList, threshold);

}

private void setN(int n){

N = n;

}

public static void main(String[] args) {

String[] ins = {"E://test//ganfen.txt"};

applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");

}

2. TFNeighbor.java

package com.algo.word;

import java.util.HashMap;

import java.util.Map;

/**
 * Per-word statistics: term frequency plus the distribution of the characters
 * seen immediately to the left and to the right of the word.
 */
public class TFNeighbor {

    /** Number of occurrences recorded through {@link #incrementTF()}. */
    private int tf;

    /** Left-neighbour character -> number of times it was recorded. */
    private final Map<Character, Integer> leftNeighbor;

    /** Right-neighbour character -> number of times it was recorded. */
    private final Map<Character, Integer> rightNeighbor;

    TFNeighbor() {
        leftNeighbor = new HashMap<Character, Integer>();
        rightNeighbor = new HashMap<Character, Integer>();
    }

    /** Records one more occurrence of {@code word} as a left neighbour. */
    public void addToLeftNeighbor(char word) {
        increment(leftNeighbor, word);
    }

    /** Records one more occurrence of {@code word} as a right neighbour. */
    public void addToRightNeighbor(char word) {
        increment(rightNeighbor, word);
    }

    /** Increments the term frequency by one. */
    public void incrementTF() {
        tf++;
    }

    /** Returns the number of distinct left-neighbour characters. */
    public int getLeftNeighborNumber() {
        return leftNeighbor.size();
    }

    /** Returns the number of distinct right-neighbour characters. */
    public int getRightNeighborNumber() {
        return rightNeighbor.size();
    }

    /** Returns the entropy (natural log) of the left-neighbour distribution, 0 if empty. */
    public double getLeftNeighborEntropy() {
        return entropy(leftNeighbor);
    }

    /** Returns the entropy (natural log) of the right-neighbour distribution, 0 if empty. */
    public double getRightNeighborEntropy() {
        return entropy(rightNeighbor);
    }

    /** Returns the term frequency. */
    public int getTF() {
        return tf;
    }

    /** Adds one to the count stored for {@code key}, starting from 1 when absent. */
    private static void increment(Map<Character, Integer> counts, char key) {
        Integer number = counts.get(key);
        counts.put(key, number == null ? 1 : 1 + number);
    }

    /**
     * Computes log(sum) - (sum of n*log(n)) / sum over the counts n, which equals
     * the Shannon entropy of the normalised counts. Returns 0 for an empty map.
     */
    private static double entropy(Map<Character, Integer> counts) {
        double weightedLogs = 0;
        int sum = 0;
        for (int number : counts.values()) {
            weightedLogs += number * Math.log(number);
            sum += number;
        }
        if (sum == 0) {
            return 0;
        }
        return Math.log(sum) - weightedLogs / sum;
    }
}

3. Main.java

package com.algo.word;

public class Main {

public static void main(String[] args) {

//if 3 arguments, first argument is input files splitting with ','

//second argument is output file

//output 7 columns split with ',' , like below:

//word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information

//third argument is stop words list

if(args.length == 3)

NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);

//if 4 arguments, forth argument is the NGram parameter N

//5th argument is threshold of output words, default is "20,3,3,5"

//output TF > 20 && (left | right) neighbor number > 3 && MI > 5

else if(args.length == 5)

NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);

}

以上所述就是本文的全部内容了，希望大家能够喜欢。

Flask接口签名sign原理与实例代码浅析

260 2023-07-30

java使用Nagao算法实现新词发现、热门词的挖掘

Flask接口签名sign原理与实例代码浅析

zookeeper python接口实例详解

hdml指的是什么接口

推荐文章

接口调用是什么意思？几种常用接口调用方式

接口设计原则

8款在线 API 接口文档管理工具

api管理系统是什么？

什么是接口调试？接口调试的步骤有哪些？

api 接口管理系统有哪些？

接口测试有几种测试方法

API文档生成工具有哪些？

微服务和api网关区别

交换机配置步骤

最近发表

热评文章

在线接口文档管理工具推荐，支持在线测试，HTTP接口

开源的在线接口文档wiki工具Mindoc的介绍与使

如何优雅的进行接口设计？接口设计的六大原则是什么？

什么是API测试,api检测公司

遇到百度网址安全中心提醒您该页面可能存在钓鱼欺诈信息

软件接口设计怎么做？前后端分离软件接口设计思路

java使用Nagao算法实现新词发现、热门词的挖掘

微信扫一扫：分享

推荐文章

最近发表

热评文章