/// <summary>
/// Builds a <c>LanguageDictionary</c> from the GCIDE XML data: loads the raw
/// entries, lets <c>GCIDEWord.formulateDictionary</c> consolidate them, drops
/// every entry that <c>GCIDEWord.isNormalWord</c> rejects, and merges the
/// remaining words into a new dictionary.
/// </summary>
/// <param name="xmlPath">
/// Location of the GCIDE XML data.
/// NOTE(review): this value is currently not forwarded anywhere - LoadXML() is
/// called without arguments. The parameter is kept so existing callers compile.
/// </param>
/// <returns>The populated dictionary.</returns>
public static LanguageDictionary fromGCIDE_XML(string xmlPath = @"X:\Programming\Resources\dictionaries\gcide_xml")
{
    List<GCIDEWord> wordList = LoadXML();
    List<GCIDEWord> words = GCIDEWord.formulateDictionary(wordList);

    // Prune in one pass. The previous approach collected the rejected words and
    // then called List.Remove for each of them, which rescans the list every time.
    words.RemoveAll(candidate =>
    {
        if (GCIDEWord.isNormalWord(candidate))
        {
            return false;
        }

        Console.WriteLine("Pruning: " + candidate);
        return true;
    });

    // (The export and word-frequency passes that used to be sketched here are disabled.)

    // Explicit collection before building the final dictionary, as in the original flow.
    GC.Collect();

    //build the dictionary
    LanguageDictionary dic = new LanguageDictionary();
    foreach (GCIDEWord word in words)
    {
        dic.MergeWordIntoSortedDictionary(word.toWord(dic));
    }

    return dic;
}
/// <summary>
/// Writes every word to "dictionary.txt" (text, definition, basic type and a
/// blank line per entry, ordered by dictionary key) and prints some statistics
/// to the console: each word containing "zz" and a count per basic type.
/// </summary>
/// <param name="words">Words to export, keyed by string.</param>
/// <param name="maxWord">Receives the length of the longest word text (0 if there are no words).</param>
/// <param name="maxDef">Receives the length of the longest definition (0 if there are no words).</param>
private static void ExportDictionary(Dictionary<string, GCIDEWord> words, out int maxWord, out int maxDef)
{
    maxWord = 0;
    maxDef = 0;

    StringBuilder dump = new StringBuilder();
    Dictionary<string, int> typeCounts = new Dictionary<string, int>();

    foreach (GCIDEWord word in words.OrderBy(pair => pair.Key).Select(pair => pair.Value))
    {
        // One four-line record per word.
        dump.AppendLine(word.Text);
        dump.AppendLine(word.Definition);
        dump.AppendLine(word.BasicType);
        dump.AppendLine();

        // Track the longest text and definition seen so far.
        maxWord = Math.Max(maxWord, word.Text.Length);
        maxDef = Math.Max(maxDef, word.Definition.Length);

        // Tally how often each basic type occurs.
        string type = word.BasicType;
        int seen;
        typeCounts[type] = typeCounts.TryGetValue(type, out seen) ? seen + 1 : 1;

        if (word.Text.ToLower().Contains("zz"))
        {
            Console.Out.WriteLine("ZZZZ: " + word.Text);
        }
    }

    // Report the tallies with the type name padded to at least ten characters.
    foreach (KeyValuePair<string, int> tally in typeCounts)
    {
        string label = tally.Key.PadRight(10);
        Console.WriteLine("UTS: " + label + " " + tally.Value);
    }

    File.WriteAllText("dictionary.txt", dump.ToString());
}
/// <summary>
/// Loads "gcide.xml" (with DTD processing and validation enabled) and walks the
/// document breadth-first, turning every <c>p</c> element that has a <c>def</c>
/// child into either a new <c>GCIDEWord</c> or an extra definition appended to
/// the most recently added word.
/// </summary>
/// <returns>
/// The words parsed. Any exception is printed to the console and whatever was
/// parsed up to that point is returned (best effort).
/// </returns>
private static List<GCIDEWord> LoadXML()
{
    List<GCIDEWord> words = new List<GCIDEWord>();
    try
    {
        Console.WriteLine("xml fix up");
        string dir = @"X:\Programming\Resources\dictionaries\gcide_xml\xml_files";

        // Wrap the raw "a" file in a single root element and save a copy.
        // NOTE(review): this method never reads "fixedxml.xml" back (the Load call
        // that used it was commented out); kept in case something else relies on the file.
        string file = Path.Combine(dir, string.Format("gcide_{0}.xml", 'a'));
        string rawXML = File.ReadAllText(file);
        string fixedXml = string.Format("<dictionary>\r\n{0}\r\n</dictionary>", rawXML);
        File.WriteAllText("fixedxml.xml", fixedXml);

        XmlReaderSettings settings = new XmlReaderSettings();
        settings.DtdProcessing = DtdProcessing.Parse;
        settings.ValidationType = ValidationType.DTD;
        settings.MaxCharactersFromEntities = 64 * 1024 * 1024;

        XmlDocument xml = new XmlDocument();

        // Dispose the reader as soon as the document is in memory so the file handle is released.
        using (XmlReader reader = XmlReader.Create(Path.Combine(dir, "gcide.xml"), settings))
        {
            Console.WriteLine("xml loading");
            xml.Load(reader);
        }

        // Breadth-first walk over the whole document.
        Queue<XmlNode> nodes = new Queue<XmlNode>();
        foreach (XmlNode topLevel in xml.ChildNodes)
        {
            nodes.Enqueue(topLevel);
        }

        // Carried across paragraphs: set when the last paragraph without hw/mhw/sn had an h1.
        bool multipleDefs = false;
        Console.WriteLine("parsing xml");
        char lastLetter = (char)('a' - 1);

        while (nodes.Count > 0)
        {
            XmlNode node = nodes.Dequeue();

            if (node.Name != "p")
            {
                // Not a paragraph: keep descending.
                foreach (XmlNode childNode in node.ChildNodes)
                {
                    nodes.Enqueue(childNode);
                }

                continue;
            }

            //parse the xml
            XmlNode defNode = node.SelectSingleNode("def");
            XmlNode wordNode = node.SelectSingleNode("hw");
            if (wordNode == null)
            {
                wordNode = getChild(node, "mhw", "hw");
            }

            // True whenever neither hw nor mhw/hw was found; the definition is then
            // attached to the previously added word.
            // NOTE(review): the flag is set even when no sn element exists, which makes the
            // multipleDefs and "def without a word" branches further down unreachable.
            // Confirm whether it was meant to be set only when sn is present.
            bool attachToPrevious = false;
            if (wordNode == null)
            {
                wordNode = node.SelectSingleNode("sn");
                attachToPrevious = true;
            }

            //multidef?
            if (wordNode == null)
            {
                wordNode = node.SelectSingleNode("h1");
                if (wordNode != null)
                {
                    multipleDefs = true;
                }
            }
            else
            {
                multipleDefs = false;
            }

            if (defNode == null)
            {
                continue;
            }

            //add the word to the dictionary
            if (attachToPrevious)
            {
                appendToPreviousWord(words, defNode.InnerText);
            }
            else if (wordNode != null)
            {
                //get type: prefer a pos directly under the paragraph, else one inside the definition
                XmlNode typeNode = node.SelectSingleNode("pos") ?? defNode.SelectSingleNode("pos");
                string type = (typeNode != null) ? typeNode.InnerText : "";

                //save word
                GCIDEWord word = new GCIDEWord(wordNode.InnerText, defNode.InnerText, type);
                words.Add(word);

                //status: print each new highest initial letter once.
                // Guarded so an empty headword cannot abort the whole load.
                string text = word.Text;
                if (!string.IsNullOrEmpty(text))
                {
                    char initial = text.ToLower()[0];
                    if (initial > lastLetter)
                    {
                        lastLetter = initial;
                        Console.Write(lastLetter + " ");
                    }
                }

                if (word.isNormalWord() && word.hasOddChars())
                {
                    Console.WriteLine("ERROR");
                }
            }
            else if (multipleDefs)
            {
                appendToPreviousWord(words, defNode.InnerText);
            }
            else
            {
                Console.WriteLine("#ERROR: def without a word");
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and return what has been parsed so far.
        Console.WriteLine(ex);
    }

    return words;
}

/// <summary>
/// Appends a definition to the most recently added word, or reports an orphaned
/// definition when no word has been added yet (instead of throwing).
/// </summary>
private static void appendToPreviousWord(List<GCIDEWord> words, string definition)
{
    if (words.Count == 0)
    {
        Console.WriteLine("#ERROR: def without a word");
        return;
    }

    words[words.Count - 1].appendDefinition(definition);
}