Example #1
0
        /// <summary>
        /// Parses a single whitespace-separated data line into a <see cref="WordnetEntry"/>.
        /// </summary>
        /// <param name="line">One raw line of input; may be null or blank.</param>
        /// <returns>
        /// The parsed entry, or <c>null</c> when <paramref name="line"/> is null, blank,
        /// or does not begin with the character '0'.
        /// </returns>
        /// <exception cref="IndexOutOfRangeException">
        /// The line has fewer tokens than the fixed fields and the declared word count require.
        /// </exception>
        /// <exception cref="FormatException">The word-count token is not a valid hexadecimal number.</exception>
        private static WordnetEntry ParseLine(string line)
        {
            // Only lines starting with '0' are treated as data. The comparison is ordinal so the
            // check does not depend on the current culture's collation rules.
            if (string.IsNullOrWhiteSpace(line) || !line.StartsWith("0", StringComparison.Ordinal))
            {
                return(null);
            }

            var parts = line.Split(space, StringSplitOptions.RemoveEmptyEntries);

            WordnetEntry result = new WordnetEntry();

            // Token 2 is the part-of-speech code. "s" is folded into "a" and "r" is renamed to "b".
            // NOTE(review): presumably this matches the prefixes used by the ids that are looked up
            // against these entries - confirm against the caller.
            string partOfSpeech = parts[2];

            if (partOfSpeech == "s")
            {
                partOfSpeech = "a";
            }
            if (partOfSpeech == "r")
            {
                partOfSpeech = "b";
            }

            // The id is the part-of-speech code followed by token 0 with its leading zeros removed.
            result.Id = partOfSpeech + parts[0].TrimStart('0');

            // Token 3 is the number of words, written in hexadecimal. The file is machine-readable
            // data, so it is parsed with the invariant culture.
            int wordCount = int.Parse(
                parts[3],
                System.Globalization.NumberStyles.HexNumber,
                System.Globalization.CultureInfo.InvariantCulture);

            // Words occupy every second token starting at index 4; the tokens in between are skipped.
            for (int i = 0; i < wordCount; i++)
            {
                result.Names.Add(parts[4 + (i * 2)]);
            }

            // Everything after the first '|' is the free-text description.
            int defStart = line.IndexOf('|');

            if (defStart != -1)
            {
                result.Description = line.Substring(defStart + 1).Trim();
            }
            return(result);
        }
Example #2
0
        /// <summary>
        /// Builds one <c>Concept</c> per synset of the Estonian wordnet file, enriches the concepts
        /// with English names/descriptions from the WordNet data files and with "has_hyperonym"
        /// parent links, then writes the result as a fragment file, a zip of that file, and an
        /// id mapping file.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <exception cref="InvalidOperationException">
        /// A referenced wordnet id or hyperonym literal cannot be resolved, or a line of the
        /// id mapping file has no ':' separator.
        /// </exception>
        static void Main(string[] args)
        {
            currentDir = Directory.GetCurrentDirectory();
            distDir    = Path.Combine(currentDir, "dist");

            GatherDistData();

            // Output locations, computed once instead of on every use.
            string idMappingPath = Path.Combine(currentDir, "idmapping.txt");
            string fragmentPath  = Path.Combine(currentDir, "estonian-wordnet.ddi32.xml");

            var synSets = new EstonianWordnetParser().Parse(Path.Combine(distDir, "kb73-utf8.txt"));

            // remember assigned uuids
            // Each line is "<estonianId>:<uuid>", exactly as written at the end of this method.
            // The line is split on the LAST colon: the uuid part never contains one, so this
            // round-trips correctly even if an id itself contains ':'.
            Dictionary <string, Guid> idMapping = new Dictionary <string, Guid>();

            if (File.Exists(idMappingPath))
            {
                foreach (var mappingLine in File.ReadAllLines(idMappingPath, Encoding.UTF8))
                {
                    if (string.IsNullOrWhiteSpace(mappingLine))
                    {
                        continue;
                    }

                    int separator = mappingLine.LastIndexOf(':');
                    if (separator < 0)
                    {
                        throw new InvalidOperationException("Malformed id mapping line: " + mappingLine);
                    }

                    idMapping.Add(mappingLine.Substring(0, separator), new Guid(mappingLine.Substring(separator + 1)));
                }
            }

            VersionableBase.DefaultAgencyId = "ee.stat";
            long version = 1;

            var concepts       = new Dictionary <string, Concept>();
            var idMappingLines = new List <string>();

            // Maps "<partOfSpeech>:<literal>:<sense>" to the id of the synset that contains it;
            // used further down to resolve the object side of a relation.
            Dictionary <string, string> literalToId = new Dictionary <string, string>();

            foreach (var synSet in synSets)
            {
                var estonianId = synSet.Token.Trim('@');

                var concept = new Concept()
                {
                    Version = version
                };

                // Reuse the identifier assigned on a previous run, if there is one, so that
                // identifiers stay stable across runs.
                Guid uuid;
                if (idMapping.TryGetValue(estonianId, out uuid))
                {
                    concept.Identifier = uuid;
                }
                idMappingLines.Add(estonianId + ":" + concept.Identifier);
                concepts.Add(estonianId, concept);
                concept.UserIds.Add(new UserId("estonianWordnet", estonianId));

                var partOfSpeech = synSet.Children.First(x => x.Token == PART_OF_SPEECH).ValueUnquoted;

                var literals = synSet.Children.First(x => x.Token == VARIANTS).Children.Where(x => x.Token == LITERAL).ToList();

                for (int i = 0; i < literals.Count; ++i)
                {
                    var literal = literals[i];
                    var sense   = literal.Children.First(x => x.Token == SENSE).Value;
                    var name    = literal.ValueUnquoted;

                    literalToId.Add(partOfSpeech + ":" + name + ":" + sense, estonianId);

                    // The first literal is the item name; the label lists all literals, comma separated.
                    if (i == 0)
                    {
                        concept.ItemName["et-EE"] = name;
                    }

                    if (concept.Label.IsEmpty)
                    {
                        concept.Label["et-EE"] = name;
                    }
                    else
                    {
                        concept.Label["et-EE"] = concept.Label["et-EE"] + ", " + name;
                    }

                    var definition = literal.Children.Where(x => x.Token == DEFINITION).Select(y => y.ValueUnquoted).FirstOrDefault();

                    var examples = literal.Children.Where(x => x.Token == EXAMPLES)
                                   .SelectMany(x => x.Children.Where(y => y.Token == EXAMPLE))
                                   .Select(z => z.ValueUnquoted).ToList();

                    // Append this literal's definition and examples to the description as a
                    // bulleted list. With more than one literal, each literal gets its own
                    // "+ name" heading and its bullets are indented by four spaces.
                    StringBuilder sb = new StringBuilder();
                    string        currentDescription = null;
                    if (concept.Description.TryGetValue("et-EE", out currentDescription))
                    {
                        sb.Append(currentDescription);
                    }

                    string fourSpaces = "";
                    if (literals.Count > 1)
                    {
                        sb.Append("+ ");
                        sb.AppendLine(name);
                        fourSpaces = "    ";
                    }
                    if (!string.IsNullOrWhiteSpace(definition))
                    {
                        sb.Append(fourSpaces);
                        sb.Append("- ");
                        sb.AppendLine(definition);
                    }
                    foreach (var example in examples)
                    {
                        sb.Append(fourSpaces);
                        sb.Append("- ");
                        sb.AppendLine(example);
                    }

                    concept.Description["et-EE"] = sb.ToString();
                }

                // Collect the TARGET_ILI nodes of all (near-)synonym equivalence links.
                var wordnetSynonyms = synSet.Children.Where(x => x.Token == EQ_LINKS)
                                      .SelectMany(x => x.Children.Where(y => y.ValueUnquoted == "eq_synonym" || y.ValueUnquoted == "eq_near_synonym"))
                                      .SelectMany(a => a.Children.Where(b => b.Token == TARGET_ILI)).ToList();

                foreach (var ili in wordnetSynonyms)
                {
                    // Skip targets without a wordnet offset as their second child. The count
                    // check covers targets that have no second child at all, which would
                    // otherwise fail on the indexer.
                    if (ili.Children.Count() < 2 || ili.Children[1].Token != WORDNET_OFFSET)
                    {
                        continue;
                    }

                    string partOfSpeechRef = ili.Children[0].ValueUnquoted;
                    string wordnetOffset   = ili.Children[1].ValueUnquoted;
                    concept.UserIds.Add(new UserId("wordnet15", partOfSpeechRef + wordnetOffset));
                    concept.UserAttributes.Add(new UserAttribute()
                    {
                        Key   = "x:wordnet15PartOfSpeech",
                        Value = partOfSpeechRef
                    });
                    concept.UserAttributes.Add(new UserAttribute()
                    {
                        Key   = "x:wordnet15WordnetOffset",
                        Value = wordnetOffset
                    });
                }
            }

            // Drop the reference to the parsed synsets; they are not needed past this point.
            synSets = null;

            Dictionary <string, WordnetEntry> pairs = new Dictionary <string, WordnetEntry>();

            WordnetParser.AddToDictionary(Path.Combine(distDir, "DICT", "ADJ.DAT"), pairs);
            WordnetParser.AddToDictionary(Path.Combine(distDir, "DICT", "ADV.DAT"), pairs);
            WordnetParser.AddToDictionary(Path.Combine(distDir, "DICT", "VERB.DAT"), pairs);
            WordnetParser.AddToDictionary(Path.Combine(distDir, "DICT", "NOUN.DAT"), pairs);

            // Add English name, label and description from the referenced wordnet entries.
            // When a concept references several entries, the last one processed wins.
            foreach (var concept in concepts.Values)
            {
                List <string> wordnetIds = concept.UserIds.Where(y => y.Type == "wordnet15").Select(x => x.Identifier).ToList();

                WordnetEntry entry = null;
                foreach (var wordnetId in wordnetIds)
                {
                    if (pairs.TryGetValue(wordnetId, out entry))
                    {
                        concept.ItemName["en-US"] = entry.Names.First();
                        concept.Label["en-US"]    = string.Join(", ", entry.Names);
                        if (!string.IsNullOrWhiteSpace(entry.Description))
                        {
                            concept.Description["en-US"] = entry.Description;
                        }
                    }
                    else
                    {
                        throw new InvalidOperationException("Unknown wordnet id");
                    }
                }
            }

            // has_hyperonym for parent
            var relations = File.ReadAllLines(Path.Combine(distDir, "kb73-utf8.rix"), Encoding.UTF8);

            foreach (var relation in relations)
            {
                // Tolerate blank lines (e.g. at the end of the file).
                if (string.IsNullOrWhiteSpace(relation))
                {
                    continue;
                }

                var parts = relation.Split(':');

                // Filter on the predicate before touching the object fields, so relations of
                // other kinds are not required to have them.
                var predicate = parts[2];
                if (predicate != "has_hyperonym")
                {
                    continue;
                }

                var subjectId          = parts[0];
                var objectPartOfSpeech = parts[3];
                var objectLiteral      = parts[4];
                var objectSense        = parts[5];

                string objectId;
                if (literalToId.TryGetValue($"{objectPartOfSpeech}:{objectLiteral}:{objectSense}", out objectId))
                {
                    var subjectConcept = concepts[subjectId];
                    var objectConcept  = concepts[objectId];
                    subjectConcept.SubclassOf.Add(objectConcept);
                }
                else
                {
                    throw new InvalidOperationException("Unknown literal");
                }
            }

            FragmentInstance instance = new FragmentInstance();

            instance.Items.Merge(concepts.Values);
            WriteFragment(fragmentPath, instance);

            // compress the DDI since it is large
            using (FileStream fs = new FileStream(fragmentPath + ".zip", FileMode.Create))
                using (ZipArchive archive = new ZipArchive(fs, ZipArchiveMode.Create))
                {
                    archive.CreateEntryFromFile(fragmentPath, "estonian-wordnet.ddi32.xml");
                }

            // Persist the id -> uuid assignments so the next run reuses the same identifiers.
            File.WriteAllLines(idMappingPath, idMappingLines, Encoding.UTF8);
        }