public class NGramUtils
extends java.lang.Object
Constructor and Description |
---|
NGramUtils() |
Modifier and Type | Method and Description |
---|---|
static double |
calculateBigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set)
calculate the probability of a bigram in a vocabulary using maximum likelihood estimation
|
static double |
calculateBigramPriorSmoothingProbability(java.lang.String x0,
java.lang.String x1,
java.util.Collection<StringList> set,
java.lang.Double k)
calculate the probability of a bigram in a vocabulary using prior Laplace smoothing algorithm
|
static double |
calculateLaplaceSmoothingProbability(StringList ngram,
java.lang.Iterable<StringList> set,
java.lang.Double k)
calculate the probability of a ngram in a vocabulary using Laplace smoothing algorithm
|
static double |
calculateMissingNgramProbabilityMass(StringList ngram,
java.lang.Double discount,
java.lang.Iterable<StringList> set)
calculate the probability of a ngram in a vocabulary using the missing probability mass algorithm
|
static double |
calculateNgramMLProbability(StringList ngram,
java.lang.Iterable<StringList> set)
calculate the probability of a ngram in a vocabulary using maximum likelihood estimation
|
static double |
calculateTrigramLinearInterpolationProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.util.Collection<StringList> set,
java.lang.Double lambda1,
java.lang.Double lambda2,
java.lang.Double lambda3)
calculate the probability of a trigram in a vocabulary using a linear interpolation algorithm
|
static double |
calculateTrigramMLProbability(java.lang.String x0,
java.lang.String x1,
java.lang.String x2,
java.lang.Iterable<StringList> set)
calculate the probability of a trigram in a vocabulary using maximum likelihood estimation
|
static double |
calculateUnigramMLProbability(java.lang.String word,
java.util.Collection<StringList> set)
calculate the probability of a unigram in a vocabulary using maximum likelihood estimation
|
static java.util.Collection<java.lang.String[]> |
getNGrams(java.lang.String[] sequence,
int size)
Get the ngrams of dimension n of a certain input sequence of tokens.
|
static java.util.Collection<StringList> |
getNGrams(StringList sequence,
int size)
Get the ngrams of dimension n of a certain input sequence of tokens.
|
static StringList |
getNMinusOneTokenFirst(StringList ngram)
get the (n-1)-gram of a given ngram, that is, the same ngram without its last word
|
static StringList |
getNMinusOneTokenLast(StringList ngram)
get the (n-1)-gram of a given ngram, that is, the same ngram without its first word
|
public static double calculateLaplaceSmoothingProbability(StringList ngram, java.lang.Iterable<StringList> set, java.lang.Double k)
ngram
- the ngram to get the probability for
set
- the vocabulary
k
- the smoothing factor

public static double calculateUnigramMLProbability(java.lang.String word, java.util.Collection<StringList> set)
word
- the only word in the unigram
set
- the vocabulary

public static double calculateBigramMLProbability(java.lang.String x0, java.lang.String x1, java.util.Collection<StringList> set)
x0
- first word in the bigram
x1
- second word in the bigram
set
- the vocabulary

public static double calculateTrigramMLProbability(java.lang.String x0, java.lang.String x1, java.lang.String x2, java.lang.Iterable<StringList> set)
x0
- first word in the trigram
x1
- second word in the trigram
x2
- third word in the trigram
set
- the vocabulary

public static double calculateNgramMLProbability(StringList ngram, java.lang.Iterable<StringList> set)
ngram
- an ngram
set
- the vocabulary

public static double calculateBigramPriorSmoothingProbability(java.lang.String x0, java.lang.String x1, java.util.Collection<StringList> set, java.lang.Double k)
x0
- the first word in the bigram
x1
- the second word in the bigram
set
- the vocabulary
k
- the smoothing factor

public static double calculateTrigramLinearInterpolationProbability(java.lang.String x0, java.lang.String x1, java.lang.String x2, java.util.Collection<StringList> set, java.lang.Double lambda1, java.lang.Double lambda2, java.lang.Double lambda3)
x0
- the first word in the trigram
x1
- the second word in the trigram
x2
- the third word in the trigram
set
- the vocabulary
lambda1
- trigram interpolation factor
lambda2
- bigram interpolation factor
lambda3
- unigram interpolation factor

public static double calculateMissingNgramProbabilityMass(StringList ngram, java.lang.Double discount, java.lang.Iterable<StringList> set)
ngram
- the ngram
discount
- discount factor
set
- the vocabulary

public static StringList getNMinusOneTokenFirst(StringList ngram)
ngram
- an ngram

public static StringList getNMinusOneTokenLast(StringList ngram)
ngram
- an ngram

public static java.util.Collection<StringList> getNGrams(StringList sequence, int size)
sequence
- a sequence of tokens
size
- the size of the resulting ngrams

public static java.util.Collection<java.lang.String[]> getNGrams(java.lang.String[] sequence, int size)
sequence
- a sequence of tokens
size
- the size of the resulting ngrams

Copyright © 2010 - 2023 Adobe. All Rights Reserved