/// <summary>
/// Reads a table of probabilities from the given stream, converting each
/// value from log base 10 to the internal log base via <c>_logMath</c>.
/// </summary>
/// <param name="stream">The stream from which to read the table.</param>
/// <param name="bigEndian">True if the given stream is big-endian, false otherwise.</param>
/// <returns>The probabilities, converted to the internal log base.</returns>
/// <exception cref="IOException">If the table size read from the stream is invalid,
/// which indicates a corrupt model file.</exception>
private float[] ReadFloatTable(Stream stream, bool bigEndian)
{
    var numProbs = ReadInt(stream, bigEndian);

    // A non-positive or oversized count means the file is corrupt; fail fast
    // with a specific exception rather than allocating a bogus array.
    if (numProbs <= 0 || numProbs > MaxProbTableSize)
    {
        throw new IOException("Bad probabilities table size: " + numProbs);
    }

    var probTable = new float[numProbs];
    for (var i = 0; i < numProbs; i++)
    {
        // Values are stored on disk as log10; convert to the internal log base.
        probTable[i] = _logMath.Log10ToLog(ReadFloat(stream, bigEndian));
    }

    return probTable;
}
/// <summary>
/// Loads an ARPA-format n-gram language model from the given location.
/// Parses the "\data\" header to learn how many n-grams of each order exist,
/// then reads each "\N-grams:" section, converting log10 probabilities and
/// backoff weights to the internal log base and storing them via Put.
/// Unigram probabilities are interpolated with a uniform distribution using
/// the supplied unigram weight.
/// </summary>
/// <param name="location">The URL location of the model.</param>
/// <param name="unigramWeightValue">The unigram weight (linear domain; used to
/// interpolate between the model's unigram estimate and a uniform distribution).</param>
/// <param name="dictionaryValue">The dictionary used to map word strings to Word
/// objects; words not found are mapped to Word.Unknown.</param>
private void Load(URL location, float unigramWeightValue, IDictionary dictionaryValue)
{
    string line;
    // Precompute log(w) and log(1 - w) once; both are reused for every unigram.
    float logUnigramWeight = _logMath.LinearToLog(unigramWeightValue);
    float inverseLogUnigramWeight = _logMath
        .LinearToLog(1.0 - unigramWeightValue);

    Open(location);

    // look for beginning of data
    ReadUntil("\\data\\");

    // Parse the header's "ngram N=count" statements; ngramList[i] holds the
    // declared number of (i+1)-grams. MaxDepth tracks the highest order seen.
    List <int> ngramList = new List <int>();
    while ((line = ReadLine()) != null)
    {
        if (line.StartsWith("ngram"))
        {
            // Split on whitespace and '=' so "ngram 2=4321" yields 3 tokens.
            StringTokenizer st = new StringTokenizer(line, " \t\n\r\f=");
            if (st.countTokens() != 3)
            {
                Corrupt("corrupt ngram field " + line + ' ' + st.countTokens());
            }
            st.nextToken(); // skip the literal "ngram" keyword
            int index = int.Parse(st.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
            int count = int.Parse(st.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
            // Header lines arrive in ascending order, so this places count at
            // position (order - 1).
            ngramList.Insert(index - 1, count);
            MaxDepth = Math.Max(index, MaxDepth);
        }
        else if (line.Equals("\\1-grams:"))
        {
            // End of the header; the unigram section follows immediately.
            break;
        }
    }

    // NOTE(review): the -1 presumably excludes a sentinel token (e.g. <s>)
    // from the uniform distribution's denominator — confirm against the
    // model-writing side.
    int numUnigrams = ngramList[0] - 1;
    // -log(x) = log(1/x)
    float logUniformProbability = -_logMath.LinearToLog(numUnigrams);

    // Read each n-gram section in order: 1-grams, then 2-grams, ...
    for (int index = 0; index < ngramList.Count; index++)
    {
        int ngram = index + 1;          // order of this section
        int ngramCount = ngramList[index]; // declared entry count for this order
        for (int i = 0; i < ngramCount; i++)
        {
            // Each entry: log10(prob), N words, optional log10(backoff).
            StringTokenizer tok = new StringTokenizer(ReadLine());
            int tokenCount = tok.countTokens();
            if (tokenCount != ngram + 1 && tokenCount != ngram + 2)
            {
                Corrupt("Bad format");
            }
            float log10Prob = float.Parse(tok.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
            float log10Backoff = 0.0f; // default when no backoff field is present

            // construct the WordSequence for this N-Gram
            List <Word> wordList = new List <Word>(MaxDepth);
            for (int j = 0; j < ngram; j++)
            {
                string word = tok.nextToken();
                _vocabulary.Add(word);
                // Words missing from the dictionary map to the unknown word.
                Word wordObject = dictionaryValue.GetWord(word);
                if
                    (wordObject == null)
                {
                    wordObject = Word.Unknown;
                }
                wordList.Add(wordObject);
            }
            WordSequence wordSequence = new WordSequence(wordList);

            // A trailing token, if any, is the backoff weight.
            if (tok.hasMoreTokens())
            {
                log10Backoff = float.Parse(tok.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
            }

            // Convert both values from log10 to the internal log base.
            float logProb = _logMath.Log10ToLog(log10Prob);
            float logBackoff = _logMath.Log10ToLog(log10Backoff);

            // Apply unigram weights if this is a unigram probability:
            // p = w * p_model + (1 - w) * p_uniform, computed in log space.
            if (ngram == 1)
            {
                float p1 = logProb + logUnigramWeight;
                float p2 = logUniformProbability + inverseLogUnigramWeight;
                logProb = _logMath.AddAsLinear(p1, p2);
            }
            Put(wordSequence, logProb, logBackoff);
        }

        // Skip ahead to the next section header unless this was the last order.
        if (index < ngramList.Count - 1)
        {
            string next = "\\" + (ngram + 1) + "-grams:";
            ReadUntil(next);
        }
    }
    ReadUntil("\\end\\");
    Close();
}