Example #1
        /// <summary>
        /// Reads the probability table from the given stream.
        /// </summary>
        /// <param name="stream">The stream from which to read the table.</param>
        /// <param name="bigEndian">True if the stream is big-endian, false otherwise.</param>
        /// <returns>The probabilities, converted to the internal log base.</returns>
        /// <exception cref="IOException">If an I/O error occurs while reading.</exception>
        private float[] ReadFloatTable(Stream stream, bool bigEndian)
        {
            var numProbs = ReadInt(stream, bigEndian);

            if (numProbs <= 0 || numProbs > MaxProbTableSize)
            {
                throw new Exception("Bad probabilities table size: " + numProbs);
            }

            var probTable = new float[numProbs];

            for (var i = 0; i < numProbs; i++)
            {
                // Probabilities are stored as log10; convert to the internal log base.
                probTable[i] = _logMath.Log10ToLog(ReadFloat(stream, bigEndian));
            }

            return probTable;
        }
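
The ReadInt and ReadFloat calls above are helpers of the surrounding binary loader. As a rough sketch of what such helpers might look like (the names and class below are assumptions for illustration, not the library's actual code), they can be built on BitConverter with a byte swap whenever the stream's endianness differs from the host's:

using System;
using System.IO;

// Hypothetical helpers; the names mirror the ReadInt/ReadFloat calls above.
internal static class EndianReader
{
    // Reads a 32-bit integer, byte-swapping when the stream's endianness
    // differs from the host's (typically little-endian on .NET).
    public static int ReadInt(Stream stream, bool bigEndian)
    {
        var bytes = ReadBytes(stream, 4);
        if (bigEndian == BitConverter.IsLittleEndian)
            Array.Reverse(bytes);
        return BitConverter.ToInt32(bytes, 0);
    }

    // Reads a 32-bit float with the same endianness handling.
    public static float ReadFloat(Stream stream, bool bigEndian)
    {
        var bytes = ReadBytes(stream, 4);
        if (bigEndian == BitConverter.IsLittleEndian)
            Array.Reverse(bytes);
        return BitConverter.ToSingle(bytes, 0);
    }

    // Reads exactly 'count' bytes or throws if the stream ends early.
    private static byte[] ReadBytes(Stream stream, int count)
    {
        var bytes = new byte[count];
        var offset = 0;
        while (offset < count)
        {
            var read = stream.Read(bytes, offset, count - offset);
            if (read <= 0)
                throw new EndOfStreamException("Unexpected end of stream");
            offset += read;
        }
        return bytes;
    }
}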
Example #2
        /// <summary>
        /// Loads the language model from the given location.
        /// </summary>
        /// <param name="location">The URL location of the model.</param>
        /// <param name="unigramWeightValue">The unigram weight.</param>
        /// <param name="dictionaryValue">The dictionary.</param>
        private void Load(URL location, float unigramWeightValue, IDictionary dictionaryValue)
        {
            string line;
            float  logUnigramWeight        = _logMath.LinearToLog(unigramWeightValue);
            float  inverseLogUnigramWeight = _logMath
                                             .LinearToLog(1.0 - unigramWeightValue);

            Open(location);
            // look for beginning of data
            ReadUntil("\\data\\");
            // look for ngram statements
            List <int> ngramList = new List <int>();

            while ((line = ReadLine()) != null)
            {
                if (line.StartsWith("ngram"))
                {
                    StringTokenizer st = new StringTokenizer(line, " \t\n\r\f=");
                    if (st.countTokens() != 3)
                    {
                        Corrupt("corrupt ngram field " + line + ' '
                                + st.countTokens());
                    }
                    st.nextToken();
                    int index = int.Parse(st.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
                    int count = int.Parse(st.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
                    ngramList.Insert(index - 1, count);
                    MaxDepth = Math.Max(index, MaxDepth);
                }
                else if (line.Equals("\\1-grams:"))
                {
                    break;
                }
            }
            int numUnigrams = ngramList[0] - 1;
            // The uniform probability is 1/numUnigrams; in log space, -log(x) = log(1/x)
            float logUniformProbability = -_logMath.LinearToLog(numUnigrams);

            for (int index = 0; index < ngramList.Count; index++)
            {
                int ngram      = index + 1;
                int ngramCount = ngramList[index];
                for (int i = 0; i < ngramCount; i++)
                {
                    StringTokenizer tok        = new StringTokenizer(ReadLine());
                    int             tokenCount = tok.countTokens();
                    if (tokenCount != ngram + 1 && tokenCount != ngram + 2)
                    {
                        Corrupt("Bad format");
                    }
                    float log10Prob    = float.Parse(tok.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
                    float log10Backoff = 0.0f;
                    // construct the WordSequence for this N-Gram
                    List <Word> wordList = new List <Word>(MaxDepth);
                    for (int j = 0; j < ngram; j++)
                    {
                        string word = tok.nextToken();
                        _vocabulary.Add(word);
                        Word wordObject = dictionaryValue.GetWord(word);
                        if (wordObject == null)
                        {
                            wordObject = Word.Unknown;
                        }
                        wordList.Add(wordObject);
                    }
                    WordSequence wordSequence = new WordSequence(wordList);
                    if (tok.hasMoreTokens())
                    {
                        log10Backoff = float.Parse(tok.nextToken(), CultureInfo.InvariantCulture.NumberFormat);
                    }
                    float logProb    = _logMath.Log10ToLog(log10Prob);
                    float logBackoff = _logMath.Log10ToLog(log10Backoff);
                    // Apply unigram weights if this is a unigram probability
                    if (ngram == 1)
                    {
                        // Interpolate the unigram estimate with the uniform distribution,
                        // weighted by the unigram weight (all in the log domain).
                        float p1 = logProb + logUnigramWeight;
                        float p2 = logUniformProbability + inverseLogUnigramWeight;
                        logProb = _logMath.AddAsLinear(p1, p2);
                    }
                    Put(wordSequence, logProb, logBackoff);
                }
                if (index < ngramList.Count - 1)
                {
                    string next = "\\" + (ngram + 1) + "-grams:";
                    ReadUntil(next);
                }
            }
            ReadUntil("\\end\\");
            Close();
        }
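
The loader above follows the standard ARPA back-off format: a \data\ header listing the n-gram counts, then one \N-grams: section per order, where each row carries a log10 probability, the words of the n-gram, and an optional log10 back-off weight. A minimal file of the shape it expects (the values below are made up purely for illustration):

\data\
ngram 1=3
ngram 2=2

\1-grams:
-0.6990 <s>    -0.30
-0.6990 hello  -0.25
-0.6990 </s>

\2-grams:
-0.3010 <s> hello
-0.3010 hello </s>

\end\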