/// <summary>Load multiple dictionary entries from a text asset of word/frequency count pairs.</summary>
/// <remarks>Merges with any dictionary data already loaded. The asset text is split into lines on
/// '\n' and each line into columns on whitespace. A line is skipped when it has fewer than two
/// columns, when it does not contain both requested columns, or when the count column does not
/// parse as an integer.</remarks>
/// <param name="corpus">The text asset holding the dictionary, one entry per line.</param>
/// <param name="termIndex">The zero-based column position of the word.</param>
/// <param name="countIndex">The zero-based column position of the frequency count.</param>
/// <returns>True if the asset was processed, or false if <paramref name="corpus"/> is null.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="termIndex"/> or
/// <paramref name="countIndex"/> is negative.</exception>
public bool LoadDictionary(TextAsset corpus, int termIndex, int countIndex)
{
    if (corpus == null)
    {
        return false;
    }
    if (termIndex < 0)
    {
        throw new ArgumentOutOfRangeException("termIndex", "Column index must not be negative.");
    }
    if (countIndex < 0)
    {
        throw new ArgumentOutOfRangeException("countIndex", "Column index must not be negative.");
    }

    // A usable line needs at least two columns AND must reach the highest requested column;
    // checking only "Length >= 2" would let an index of 2 or more run past the end of the array.
    int highestColumn = Math.Max(1, Math.Max(termIndex, countIndex));

    var staging = new SuggestionStage(16384);
    foreach (string line in corpus.text.Split('\n'))
    {
        // Split(null) splits on any whitespace, so a trailing '\r' from CRLF text
        // ends up as an empty extra column rather than being glued to the count.
        string[] lineParts = line.Split(null);
        if (lineParts.Length <= highestColumn)
        {
            continue;
        }

        Int64 count;
        if (Int64.TryParse(lineParts[countIndex], out count))
        {
            CreateDictionaryEntry(lineParts[termIndex], count, staging);
        }
    }

    // The delete index is created lazily; size it from what was staged.
    if (this.deletes == null)
    {
        this.deletes = new Dictionary<int, string[]>(staging.DeleteCount);
    }
    CommitStaged(staging);
    return true;
}
/// <summary>Load multiple dictionary words from a file containing plain text.</summary>
/// <remarks>Merges with any dictionary data already loaded. Every word that ParseWords
/// yields for a line is registered with a count of 1, so repeated words accumulate.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
/// <returns>True if file loaded, or false if file not found.</returns>
public bool CreateDictionary(string corpus)
{
    bool corpusExists = File.Exists(corpus);
    if (corpusExists)
    {
        var staging = new SuggestionStage(16384);
        using (var reader = new StreamReader(File.OpenRead(corpus)))
        {
            // Pull one line at a time so the whole corpus never has to sit in memory.
            string text = reader.ReadLine();
            while (text != null)
            {
                foreach (string key in ParseWords(text))
                {
                    CreateDictionaryEntry(key, 1, staging);
                }
                text = reader.ReadLine();
            }
        }

        // The delete index is created lazily; size it from what was staged.
        if (this.deletes == null)
        {
            this.deletes = new Dictionary<int, string[]>(staging.DeleteCount);
        }
        CommitStaged(staging);
        return true;
    }

    return false;
}
/// <summary>Load word/frequency count pairs from a UTF-8 stream, converting each word
/// to the requested script before adding it.</summary>
/// <remarks>Each line is split on whitespace; column 0 is the word and column 1 the count.
/// Lines with fewer than two columns, or with a count that does not parse as an integer,
/// are skipped. For ULY and USY the word is passed through the matching Uyghur converter
/// and lower-cased; for any other value it is used unchanged. The reader is disposed when
/// loading finishes, which also closes <paramref name="instr"/>.</remarks>
/// <param name="instr">The stream to read the dictionary from.</param>
/// <param name="yeziq">The target script; also stored in gYeziq.</param>
/// <returns>Always true.</returns>
public override bool LoadDictionary(Stream instr, Uyghur.YEZIQ yeziq)
{
    gYeziq = yeziq;

    // NOTE(review): entries below go through Add(term, frequency), which is not visible here,
    // and are never handed to this staging object. Unless Add feeds it by some other route,
    // the staging object stays empty and the commit at the end is a no-op - confirm intent.
    var staging = new SuggestionStage(16384);

    using (var reader = new StreamReader(instr, System.Text.Encoding.UTF8))
    {
        // Pull one line at a time so the whole input never has to sit in memory.
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string[] columns = line.Split(null);
            if (columns.Length < 2)
            {
                continue;
            }

            string term = columns[0];
            if (yeziq == Uyghur.YEZIQ.ULY)
            {
                term = Uyghur.UEY2ULY(term).ToLower();
            }
            else if (yeziq == Uyghur.YEZIQ.USY)
            {
                term = Uyghur.UEY2USY(term).ToLower();
            }

            Int64 frequency;
            if (Int64.TryParse(columns[1], out frequency))
            {
                Add(term, frequency);
            }
        }
    }

    if (this.deletes == null)
    {
        this.deletes = new Dictionary<int, string[]>(staging.DeleteCount);
    }
    CommitStaged(staging);
    return true;
}
/// <summary>Commit staged dictionary additions.</summary>
/// <remarks>Used when you write your own process to load multiple words into the
/// dictionary, and as part of that process, you first created a SuggestionStage
/// object, and passed that to CreateDictionaryEntry calls. If the delete index has
/// not been created yet, it is created here, sized from the staged delete count,
/// so that the staged data is not committed into a null dictionary.</remarks>
/// <param name="staging">The SuggestionStage object storing the staged data.</param>
public void CommitStaged(SuggestionStage staging)
{
    // The delete index is created lazily, and CreateDictionaryEntry does not create it
    // when a staging object is supplied - so a caller's first commit can arrive here
    // with no index to commit into.
    if (this.deletes == null)
    {
        this.deletes = new Dictionary<int, string[]>(staging.DeleteCount);
    }
    staging.CommitTo(this.deletes);
}
/// <summary>Create/Update an entry in the dictionary.</summary>
/// <remarks>A word seen for the first time at or above the count threshold is stored in the
/// word list, and every delete produced by EditsPrefix for it is indexed (by string hash) so it
/// points back to the word. Words below the threshold are tracked separately and promoted once
/// their accumulated count reaches the threshold. Counts are capped at Int64.MaxValue.
/// The dictionary may be updated (word frequency and new words) at any time by calling this method.</remarks>
/// <param name="key">The word to add to dictionary.</param>
/// <param name="count">The frequency count for word.</param>
/// <param name="staging">Optional staging object to speed up adding many entries by staging them to a temporary structure.</param>
/// <returns>True if the word was added as a new correctly spelled word,
/// or false if the word is added as a below threshold word, or updates an
/// existing correctly spelled word.</returns>
public bool CreateDictionaryEntry(string key, Int64 count, SuggestionStage staging = null)
{
    bool hasNoWeight = count <= 0;
    if (hasNoWeight && this.countThreshold > 0)
    {
        // A zero count cannot change anything while a positive threshold is in force.
        return false;
    }
    if (hasNoWeight)
    {
        count = 0;
    }

    Int64 earlierCount = -1;

    // Below-threshold words can only exist when the threshold is greater than 1.
    bool wasBelowThreshold = countThreshold > 1 && belowThresholdWords.TryGetValue(key, out earlierCount);
    if (wasBelowThreshold)
    {
        count = CombineCountsCapped(earlierCount, count);
        if (count < countThreshold)
        {
            // Still short of the threshold: remember the running total and stop.
            belowThresholdWords[key] = count;
            return false;
        }

        // Threshold reached: promote (the word is added to the word list below).
        belowThresholdWords.Remove(key);
    }
    else if (words.TryGetValue(key, out earlierCount))
    {
        // Already a known word: only its count changes.
        words[key] = CombineCountsCapped(earlierCount, count);
        return false;
    }
    else if (count < CountThreshold)
    {
        // First sighting, but not frequent enough yet.
        belowThresholdWords[key] = count;
        return false;
    }

    // From here on the word is new to the word list.
    words.Add(key, count);

    // Deletes are generated once per word, on its first appearance, even if the same
    // string was already present as a delete derived from another word.
    if (maxLength < key.Length)
    {
        maxLength = key.Length;
    }

    var deleteKeys = EditsPrefix(key);

    if (staging == null)
    {
        // No staging object: write straight into the main delete index.
        if (deletes == null)
        {
            this.deletes = new Dictionary<int, string[]>(initialCapacity);
        }

        foreach (string delete in deleteKeys)
        {
            int hash = GetStringHash(delete);
            string[] bucket;
            if (deletes.TryGetValue(hash, out bucket))
            {
                // Grow the existing suggestion list by one slot for this word.
                Array.Resize(ref bucket, bucket.Length + 1);
                deletes[hash] = bucket;
            }
            else
            {
                bucket = new string[1];
                deletes.Add(hash, bucket);
            }
            bucket[bucket.Length - 1] = key;
        }
    }
    else
    {
        foreach (string delete in deleteKeys)
        {
            staging.Add(GetStringHash(delete), key);
        }
    }

    return true;
}

/// <summary>Adds two counts, returning Int64.MaxValue instead of a sum that would not fit.</summary>
private static Int64 CombineCountsCapped(Int64 existing, Int64 increment)
{
    return (Int64.MaxValue - existing > increment) ? existing + increment : Int64.MaxValue;
}