/// <summary>
/// Parses a single line of a WordNet-style data file into a <see cref="WordnetEntry"/>.
/// </summary>
/// <param name="line">The raw line to parse; may be <c>null</c> or blank.</param>
/// <returns>
/// The parsed entry, or <c>null</c> when <paramref name="line"/> is null/blank or does not
/// begin with the character <c>'0'</c>.
/// </returns>
private static WordnetEntry ParseLine(string line)
{
    // Only lines that start with '0' are treated as records. The comparison is ordinal
    // because this is machine-readable data, not linguistic text.
    if (string.IsNullOrWhiteSpace(line) || !line.StartsWith("0", StringComparison.Ordinal))
    {
        return null;
    }

    var parts = line.Split(space, StringSplitOptions.RemoveEmptyEntries);
    WordnetEntry result = new WordnetEntry();

    // The third field is the part-of-speech code; "s" is folded into "a" and "r" into "b"
    // before it is used as the id prefix.
    string partOfSpeech = parts[2];
    if (partOfSpeech == "s")
    {
        partOfSpeech = "a";
    }
    else if (partOfSpeech == "r")
    {
        partOfSpeech = "b";
    }

    // Id = part-of-speech code followed by the first field with its leading zeros removed.
    result.Id = partOfSpeech + parts[0].TrimStart('0');

    // The fourth field is the word count, parsed as a hexadecimal number.
    int wordCount = int.Parse(
        parts[3],
        System.Globalization.NumberStyles.HexNumber,
        System.Globalization.CultureInfo.InvariantCulture);

    // Words start at the fifth field and occupy every second field after that.
    for (int i = 0; i < wordCount; i++)
    {
        result.Names.Add(parts[4 + (i * 2)]);
    }

    // Everything after the first '|' is taken as the description.
    int defStart = line.IndexOf('|');
    if (defStart != -1)
    {
        result.Description = line.Substring(defStart + 1).Trim();
    }

    return result;
}
/// <summary>
/// Entry point. Parses the Estonian wordnet file from the <c>dist</c> directory, builds one
/// <see cref="Concept"/> per synset, enriches the concepts with English names and
/// descriptions from the <c>DICT/*.DAT</c> files, links <c>has_hyperonym</c> relations as
/// subclass relationships, and writes the result as a DDI fragment, a zip of that fragment,
/// and an <c>idmapping.txt</c> file in the current directory.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when a referenced wordnet id or literal cannot be resolved, or when a relation
/// line does not have the expected number of fields.
/// </exception>
static void Main(string[] args)
{
    currentDir = Directory.GetCurrentDirectory();
    distDir = Path.Combine(currentDir, "dist");
    GatherDistData();

    EstonianWordnetParser parser = new EstonianWordnetParser();
    var synSets = parser.Parse(Path.Combine(distDir, "kb73-utf8.txt"));
    // Drop the reference once parsing is done (presumably so the parser can be collected).
    parser = null;

    // Remember previously assigned uuids so identifiers carry over between runs.
    string idMappingPath = Path.Combine(currentDir, "idmapping.txt");
    Dictionary<string, Guid> idMapping = LoadIdMapping(idMappingPath);

    VersionableBase.DefaultAgencyId = "ee.stat";
    long version = 1;
    var concepts = new Dictionary<string, Concept>();
    var idMappingLines = new List<string>();
    // Maps "partOfSpeech:literal:sense" to the id of the synset that contains the literal.
    Dictionary<string, string> literalToId = new Dictionary<string, string>();

    foreach (var synSet in synSets)
    {
        var estonianId = synSet.Token.Trim('@');
        var concept = new Concept() { Version = version };

        // Reuse the uuid recorded for this synset, if there is one.
        Guid uuid;
        if (idMapping.TryGetValue(estonianId, out uuid))
        {
            concept.Identifier = uuid;
        }

        idMappingLines.Add(estonianId + ":" + concept.Identifier);
        concepts.Add(estonianId, concept);
        concept.UserIds.Add(new UserId("estonianWordnet", estonianId));

        var partOfSpeech = synSet.Children.First(x => x.Token == PART_OF_SPEECH).ValueUnquoted;
        var literals = synSet.Children
            .First(x => x.Token == VARIANTS)
            .Children
            .Where(x => x.Token == LITERAL)
            .ToList();

        for (int i = 0; i < literals.Count; ++i)
        {
            var literal = literals[i];
            var sense = literal.Children.First(x => x.Token == SENSE).Value;
            var name = literal.ValueUnquoted;

            literalToId.Add(partOfSpeech + ":" + name + ":" + sense, estonianId);

            // The first literal names the concept; the label lists every literal.
            if (i == 0)
            {
                concept.ItemName["et-EE"] = name;
            }

            if (concept.Label.IsEmpty)
            {
                concept.Label["et-EE"] = name;
            }
            else
            {
                concept.Label["et-EE"] = concept.Label["et-EE"] + ", " + name;
            }

            var definition = literal.Children
                .Where(x => x.Token == DEFINITION)
                .Select(y => y.ValueUnquoted)
                .FirstOrDefault();
            var examples = literal.Children
                .Where(x => x.Token == EXAMPLES)
                .SelectMany(x => x.Children.Where(y => y.Token == EXAMPLE))
                .Select(z => z.ValueUnquoted)
                .ToList();

            // Append this literal's definition and examples to whatever description has
            // already been accumulated from earlier literals of the same synset.
            StringBuilder sb = new StringBuilder();
            string currentDescription = null;
            if (concept.Description.TryGetValue("et-EE", out currentDescription))
            {
                sb.Append(currentDescription);
            }

            // With several literals, each gets its own "+ name" heading and its bullets
            // are prefixed with an indent.
            // NOTE(review): the indent literal is a single space here; if a wider indent
            // (e.g. four spaces for a nested list) was intended, confirm and widen it.
            string indent = "";
            if (literals.Count > 1)
            {
                sb.Append("+ ");
                sb.AppendLine(name);
                indent = " ";
            }

            if (!string.IsNullOrWhiteSpace(definition))
            {
                sb.Append(indent);
                sb.Append("- ");
                sb.AppendLine(definition);
            }

            foreach (var example in examples)
            {
                sb.Append(indent);
                sb.Append("- ");
                sb.AppendLine(example);
            }

            concept.Description["et-EE"] = sb.ToString();
        }

        // Collect the ILI targets of (near-)synonym equivalence links.
        var wordnetSynonyms = synSet.Children
            .Where(x => x.Token == EQ_LINKS)
            .SelectMany(x => x.Children.Where(y => y.ValueUnquoted == "eq_synonym" || y.ValueUnquoted == "eq_near_synonym"))
            .SelectMany(a => a.Children.Where(b => b.Token == TARGET_ILI))
            .ToList();

        foreach (var ili in wordnetSynonyms)
        {
            // Only targets whose second child is a wordnet offset are recorded.
            if (ili.Children[1].Token != WORDNET_OFFSET)
            {
                continue;
            }

            string partOfSpeechRef = ili.Children[0].ValueUnquoted;
            string wordnetOffset = ili.Children[1].ValueUnquoted;
            concept.UserIds.Add(new UserId("wordnet15", partOfSpeechRef + wordnetOffset));
            concept.UserAttributes.Add(new UserAttribute() { Key = "x:wordnet15PartOfSpeech", Value = partOfSpeechRef });
            concept.UserAttributes.Add(new UserAttribute() { Key = "x:wordnet15WordnetOffset", Value = wordnetOffset });
        }
    }

    // Drop the reference to the parsed synsets before the dictionary files are loaded.
    synSets = null;

    AttachEnglishTexts(concepts.Values);

    // has_hyperonym for parent
    LinkHyperonyms(Path.Combine(distDir, "kb73-utf8.rix"), concepts, literalToId);

    FragmentInstance instance = new FragmentInstance();
    instance.Items.Merge(concepts.Values);
    string ddiPath = Path.Combine(currentDir, "estonian-wordnet.ddi32.xml");
    WriteFragment(ddiPath, instance);

    // compress the DDI since it is large
    string zipPath = Path.Combine(currentDir, "estonian-wordnet.ddi32.xml.zip");
    using (FileStream fs = new FileStream(zipPath, FileMode.Create))
    using (ZipArchive archive = new ZipArchive(fs, ZipArchiveMode.Create))
    {
        archive.CreateEntryFromFile(ddiPath, "estonian-wordnet.ddi32.xml");
    }

    File.WriteAllLines(idMappingPath, idMappingLines, Encoding.UTF8);
}

/// <summary>
/// Reads "id:guid" lines from <paramref name="path"/> into a dictionary.
/// </summary>
/// <param name="path">Path of the id mapping file.</param>
/// <returns>
/// The mapping from id to uuid; empty when the file does not exist. Blank lines are ignored.
/// </returns>
private static Dictionary<string, Guid> LoadIdMapping(string path)
{
    var mapping = new Dictionary<string, Guid>();
    if (!File.Exists(path))
    {
        return mapping;
    }

    foreach (var line in File.ReadAllLines(path, Encoding.UTF8))
    {
        // A stray blank line must not abort the whole run.
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        var fields = line.Split(':');
        mapping.Add(fields[0], new Guid(fields[1]));
    }

    return mapping;
}

/// <summary>
/// Loads the <c>DICT/*.DAT</c> files under the dist directory and copies English name,
/// label and description onto every concept that carries a "wordnet15" user id.
/// </summary>
/// <param name="concepts">The concepts to enrich.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when a concept references a wordnet id that is not present in the loaded files.
/// </exception>
private static void AttachEnglishTexts(IEnumerable<Concept> concepts)
{
    Dictionary<string, WordnetEntry> pairs = new Dictionary<string, WordnetEntry>();
    foreach (var fileName in new[] { "ADJ.DAT", "ADV.DAT", "VERB.DAT", "NOUN.DAT" })
    {
        WordnetParser.AddToDictionary(Path.Combine(distDir, "DICT", fileName), pairs);
    }

    foreach (var concept in concepts)
    {
        List<string> wordnetIds = concept.UserIds
            .Where(y => y.Type == "wordnet15")
            .Select(x => x.Identifier)
            .ToList();

        // When a concept has several wordnet ids, the values from the last one win.
        foreach (var wordnetId in wordnetIds)
        {
            WordnetEntry entry;
            if (!pairs.TryGetValue(wordnetId, out entry))
            {
                throw new InvalidOperationException("Unknown wordnet id: " + wordnetId);
            }

            concept.ItemName["en-US"] = entry.Names.First();
            concept.Label["en-US"] = string.Join(", ", entry.Names);
            if (!string.IsNullOrWhiteSpace(entry.Description))
            {
                concept.Description["en-US"] = entry.Description;
            }
        }
    }
}

/// <summary>
/// Reads colon-separated relation lines from <paramref name="relationsPath"/> and, for each
/// <c>has_hyperonym</c> relation, adds the object concept to the subject concept's
/// <c>SubclassOf</c> collection.
/// </summary>
/// <param name="relationsPath">Path of the relations file.</param>
/// <param name="concepts">Concepts keyed by synset id.</param>
/// <param name="literalToId">Lookup from "partOfSpeech:literal:sense" to synset id.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when a non-blank line has fewer than six fields, or when the object literal of a
/// <c>has_hyperonym</c> relation is not present in <paramref name="literalToId"/>.
/// </exception>
private static void LinkHyperonyms(
    string relationsPath,
    Dictionary<string, Concept> concepts,
    Dictionary<string, string> literalToId)
{
    foreach (var relation in File.ReadAllLines(relationsPath, Encoding.UTF8))
    {
        if (string.IsNullOrWhiteSpace(relation))
        {
            continue;
        }

        var parts = relation.Split(':');
        if (parts.Length < 6)
        {
            // Fail with the offending line instead of an opaque index error.
            throw new InvalidOperationException("Malformed relation line: " + relation);
        }

        if (parts[2] != "has_hyperonym")
        {
            continue;
        }

        var subjectId = parts[0];
        var literalKey = $"{parts[3]}:{parts[4]}:{parts[5]}";

        string objectId;
        if (!literalToId.TryGetValue(literalKey, out objectId))
        {
            throw new InvalidOperationException("Unknown literal: " + literalKey);
        }

        concepts[subjectId].SubclassOf.Add(concepts[objectId]);
    }
}