/// <summary>Load multiple dictionary entries from a text asset of word/frequency count pairs.</summary>
    /// <remarks>Merges with any dictionary data already loaded. The asset text is split into lines on '\n'
    /// and each line into columns on whitespace. A line is skipped when it has fewer than two columns, when
    /// it does not contain both requested columns, or when the count column is not a valid integer.</remarks>
    /// <param name="corpus">The text asset holding the word/frequency count pairs.</param>
    /// <param name="termIndex">The zero-based column position of the word.</param>
    /// <param name="countIndex">The zero-based column position of the frequency count.</param>
    /// <returns>True if the asset was processed, or false if <paramref name="corpus"/> is null.</returns>
    public bool LoadDictionary(TextAsset corpus, int termIndex, int countIndex)
    {
        // Nothing to load: report failure instead of failing on corpus.text.
        if (corpus == null)
        {
            return(false);
        }

        string[] linesInFile = corpus.text.Split('\n');
        var      staging     = new SuggestionStage(16384);

        foreach (String line in linesInFile)
        {
            // Splitting on whitespace also discards a trailing '\r' left over from "\r\n" line endings.
            string[] lineParts = line.Split(null);

            // Require the original two-column minimum AND that both requested columns exist,
            // so a short line is skipped rather than raising an index-out-of-range error.
            bool hasRequestedColumns = lineParts.Length >= 2
                                       && termIndex < lineParts.Length
                                       && countIndex < lineParts.Length;
            if (!hasRequestedColumns)
            {
                continue;
            }

            string key = lineParts[termIndex];
            Int64  count;
            if (Int64.TryParse(lineParts[countIndex], out count))
            {
                CreateDictionaryEntry(key, count, staging);
            }
        }

        // Allocate the deletes dictionary on first use, sized from the staged delete count.
        if (this.deletes == null)
        {
            this.deletes = new Dictionary <int, string[]>(staging.DeleteCount);
        }
        CommitStaged(staging);
        return(true);
    }
    /// <summary>Load multiple dictionary words from a file containing plain text.</summary>
    /// <remarks>Builds a frequency dictionary from the corpus and merges it with any dictionary data
    /// already loaded. Every word returned by ParseWords for a line is added with a count of 1.</remarks>
    /// <param name="corpus">The path+filename of the file.</param>
    /// <returns>True if file loaded, or false if file not found.</returns>
    public bool CreateDictionary(string corpus)
    {
        if (!File.Exists(corpus))
        {
            return false;
        }

        SuggestionStage pending = new SuggestionStage(16384);

        using (StreamReader reader = new StreamReader(File.OpenRead(corpus)))
        {
            // Read one line per iteration so the whole corpus is never held in memory at once.
            for (string text = reader.ReadLine(); text != null; text = reader.ReadLine())
            {
                foreach (string word in ParseWords(text))
                {
                    CreateDictionaryEntry(word, 1, pending);
                }
            }
        }

        // Allocate the deletes dictionary on first use, sized from the staged delete count.
        if (this.deletes == null)
        {
            this.deletes = new Dictionary <int, string[]>(pending.DeleteCount);
        }

        CommitStaged(pending);
        return true;
    }
Exemplo n.º 3
0
        /// <summary>Load word/frequency count pairs from a UTF-8 stream, converting each word to the requested script.</summary>
        /// <remarks>Each line is split on whitespace; the first column is the word and the second the count.
        /// Lines with fewer than two columns, or whose count is not a valid integer, are skipped.
        /// NOTE(review): entries are inserted through Add(key, count), which is not handed the staging object
        /// created here, so the staging object appears to still be empty when it is committed - confirm against Add.</remarks>
        /// <param name="instr">The stream to read. It is wrapped in a StreamReader that is disposed when loading ends.</param>
        /// <param name="yeziq">The target script. ULY and USY pass the word through Uyghur.UEY2ULY / Uyghur.UEY2USY
        /// and lower-case the result; any other value leaves the word as read.</param>
        /// <returns>Always true.</returns>
        public override bool LoadDictionary(Stream instr, Uyghur.YEZIQ yeziq)
        {
            gYeziq = yeziq;
            SuggestionStage pending = new SuggestionStage(16384);

            using (StreamReader reader = new StreamReader(instr, System.Text.Encoding.UTF8))
            {
                // Read one line per iteration so the whole input is never held in memory at once.
                for (string text = reader.ReadLine(); text != null; text = reader.ReadLine())
                {
                    string[] fields = text.Split(null);
                    if (fields.Length < 2)
                    {
                        continue;
                    }

                    string word = fields[0];
                    if (yeziq == Uyghur.YEZIQ.ULY)
                    {
                        word = Uyghur.UEY2ULY(word).ToLower();
                    }
                    else if (yeziq == Uyghur.YEZIQ.USY)
                    {
                        word = Uyghur.UEY2USY(word).ToLower();
                    }

                    Int64 frequency;
                    if (!Int64.TryParse(fields[1], out frequency))
                    {
                        continue;
                    }

                    Add(word, frequency);
                }
            }

            // Allocate the deletes dictionary on first use, sized from the staged delete count.
            if (this.deletes == null)
            {
                this.deletes = new Dictionary <int, string[]>(pending.DeleteCount);
            }

            CommitStaged(pending);
            return true;
        }
Exemplo n.º 4
0
 /// <summary>Commit staged dictionary additions.</summary>
 /// <remarks>Used when you write your own process to load multiple words into the
 /// dictionary, and as part of that process, you first created a SuggestionStage
 /// object, and passed that to CreateDictionaryEntry calls. If the deletes dictionary
 /// has not been allocated yet, it is created here, sized from the staged delete count.</remarks>
 /// <param name="staging">The SuggestionStage object storing the staged data.</param>
 public void CommitStaged(SuggestionStage staging)
 {
     // When every entry so far was staged, the deletes dictionary may never have been
     // allocated; create it here so the commit is not handed a null dictionary.
     if (this.deletes == null)
     {
         this.deletes = new Dictionary <int, string[]>(staging.DeleteCount);
     }
     staging.CommitTo(deletes);
 }
    /// <summary>Create/Update an entry in the dictionary.</summary>
    /// <remarks>A word that is new and at or above the count threshold gets its deletes generated via
    /// EditsPrefix; each delete is stored under its hash together with the word it was derived from.
    /// Words already known only have their count increased (saturating at Int64.MaxValue), and words
    /// still below the threshold are tracked separately until their accumulated count reaches it.
    /// The dictionary may be updated at any time by calling this method again.</remarks>
    /// <param name="key">The word to add to dictionary.</param>
    /// <param name="count">The frequency count for word.</param>
    /// <param name="staging">Optional staging object to speed up adding many entries by staging them to a temporary structure.</param>
    /// <returns>True if the word was added as a new correctly spelled word,
    /// or false if the word is added as a below threshold word, or updates an
    /// existing correctly spelled word.</returns>
    public bool CreateDictionaryEntry(string key, Int64 count, SuggestionStage staging = null)
    {
        if (count <= 0)
        {
            // With a positive threshold a non-positive count cannot change anything.
            if (this.countThreshold > 0)
            {
                return false;
            }
            count = 0;
        }

        Int64 storedCount = -1;

        // Below-threshold words can only exist when the threshold is greater than 1.
        bool wasBelowThreshold = countThreshold > 1 && belowThresholdWords.TryGetValue(key, out storedCount);
        if (wasBelowThreshold)
        {
            // Accumulate the count, saturating at Int64.MaxValue instead of overflowing.
            Int64 headroom = Int64.MaxValue - storedCount;
            count = (headroom > count) ? storedCount + count : Int64.MaxValue;

            if (count < countThreshold)
            {
                // Still below the threshold: remember the new total and stop.
                belowThresholdWords[key] = count;
                return false;
            }

            // Threshold reached: promote the word; it is added to the correct words below.
            belowThresholdWords.Remove(key);
        }
        else
        {
            if (words.TryGetValue(key, out storedCount))
            {
                // Already a correctly spelled word: only its count changes.
                Int64 headroom = Int64.MaxValue - storedCount;
                count      = (headroom > count) ? storedCount + count : Int64.MaxValue;
                words[key] = count;
                return false;
            }

            if (count < CountThreshold)
            {
                // New word that does not reach the threshold yet.
                belowThresholdWords[key] = count;
                return false;
            }
        }

        // From here on the word is new and at or above the threshold.
        words.Add(key, count);

        // Deletes are generated exactly once per word, the first time it qualifies,
        // even if the same term already exists as a delete of another word.
        if (key.Length > maxLength)
        {
            maxLength = key.Length;
        }

        var deleteTerms = EditsPrefix(key);

        if (staging != null)
        {
            // Staged mode: collect the deletes in the temporary structure.
            foreach (string term in deleteTerms)
            {
                staging.Add(GetStringHash(term), key);
            }
            return true;
        }

        // Direct mode: write straight into the main structure, allocating it on first use.
        if (deletes == null)
        {
            this.deletes = new Dictionary <int, string[]>(initialCapacity);   // initialization
        }

        foreach (string term in deleteTerms)
        {
            int      hash = GetStringHash(term);
            string[] bucket;
            if (deletes.TryGetValue(hash, out bucket))
            {
                // Grow a copy of the existing suggestion array by one slot and append the word.
                Array.Resize(ref bucket, bucket.Length + 1);
                bucket[bucket.Length - 1] = key;
                deletes[hash]             = bucket;
            }
            else
            {
                deletes.Add(hash, new string[] { key });
            }
        }

        return true;
    }