/// <summary>
/// Handles the "get related synsets" click: replaces the synset list contents with
/// the synsets related to the current selection via the relation chosen in the
/// semanticRelations list, then selects the first result.
/// </summary>
private void getRelatedSynSets_Click(object sender, EventArgs e)
{
    // need both a selected synset and a selected relation to proceed
    SynSet current = synSets.SelectedItem as SynSet;
    if (current == null)
        return;
    if (semanticRelations.SelectedIndex == -1)
        return;

    synSets.Items.Clear();

    // relation list items look like "RelationName: ..."; the enum name is the part before the colon
    string relationName = semanticRelations.SelectedItem.ToString().Split(':')[0].Trim();
    WordNetEngine.SynSetRelation relation =
        (WordNetEngine.SynSetRelation)Enum.Parse(typeof(WordNetEngine.SynSetRelation), relationName);

    // repopulate the list with the directly (non-recursively) related synsets
    foreach (SynSet related in current.GetRelatedSynSets(relation, false))
    {
        synSets.Items.Add(related);
    }

    // select the first result, if any
    if (synSets.Items.Count > 0)
    {
        synSets.SelectedIndex = 0;
    }
}
/// <summary>
/// Gets synset shells from a word index line. A synset shell is an instance of SynSet with only the POS and Offset
/// members initialized; these are enough to look up the full synset within the corresponding data file. This
/// method is static to prevent inadvertent references to a current WordNetEngine, which should be passed via the
/// corresponding parameter.
/// </summary>
/// <param name="wordIndexLine">Word index line from which to get synset shells</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="mostCommonSynSet">Returns the most common synset for the word</param>
/// <param name="wordNet">The WordNet instance</param>
/// <returns>Synset shells for the given index line</returns>
/// <exception cref="System.Exception">Failed to get most common synset</exception>
internal static List<SynSet> GetSynSetShells(string wordIndexLine, WordNetPos pos, out SynSet mostCommonSynSet, WordNet wordNet)
{
    mostCommonSynSet = null;
    var shells = new List<SynSet>();

    // index-line fields are space-separated; field 2 holds the synset count
    var fields = wordIndexLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int synSetCount = int.Parse(fields[2]);

    // the offsets occupy the final synSetCount fields; walk them back-to-front
    int firstOffsetField = fields.Length - synSetCount;
    for (int i = fields.Length - 1; i >= firstOffsetField; --i)
    {
        var shell = new SynSet(pos, int.Parse(fields[i]), wordNet);
        shells.Add(shell);

        // the first listed offset (reached last in this reverse walk) is the most common sense
        if (i == firstOffsetField)
            mostCommonSynSet = shell;
    }

    if (mostCommonSynSet == null)
        throw new Exception("Failed to get most common synset");

    return shells;
}
/// <summary>
/// Selects the synset for a word, preferring an explicit glossary entry over the
/// base-class default. When the POS has no WordNet equivalent, the base selection
/// is returned unchanged.
/// </summary>
protected override SynSet SelectSynset(string word, POS pos)
{
    // base behavior is the default until something better is found (temporary)
    SynSet chosen = base.SelectSynset(word, pos);

    WordNetEngine.POS wnPos = pos.ForWordnet();
    if (wnPos == WordNetEngine.POS.None)
        return chosen;

    IGlossaryEntry entry = glossary.FindWord(word);
    if (entry != null)
    {
        chosen = entry.Synset;
    }
    else
    {
        // no glossary hit: enumerate candidate synsets
        Set<SynSet> candidates = wordnet.GetSynSets(word, wnPos);
        foreach (SynSet candidate in candidates)
        {
            // great algorithms will be added here
        }
    }

    return chosen;
}
/// <summary>
/// Gets synset shells from a word index line. A synset shell is an instance of SynSet with only the POS and Offset
/// members initialized — enough to look up the full synset in the corresponding data file. Kept static so that no
/// current WordNetEngine is referenced by accident; the WordNet instance is passed in explicitly.
/// </summary>
/// <param name="wordIndexLine">Word index line from which to get synset shells</param>
/// <param name="pos">POS of the given index line</param>
/// <param name="mostCommonSynSet">Returns the most common synset for the word</param>
/// <param name="wordNet">The WordNet instance</param>
/// <returns>Synset shells for the given index line</returns>
/// <exception cref="System.Exception">Failed to get most common synset</exception>
internal static List<SynSet> GetSynSetShells(string wordIndexLine, WordNetPos pos, out SynSet mostCommonSynSet, WordNet wordNet)
{
    mostCommonSynSet = null;

    // the third whitespace-separated field of an index line is the synset count
    var tokens = wordIndexLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int count = int.Parse(tokens[2]);
    int start = tokens.Length - count;   // offsets are the trailing fields

    // collect shells from the last offset back to the first (reverse file order)
    var result = new List<SynSet>();
    for (int idx = tokens.Length - 1; idx >= start; idx--)
    {
        result.Add(new SynSet(pos, int.Parse(tokens[idx]), wordNet));
    }

    // the first offset listed in the file is the most common sense; in reverse
    // collection order it is the last shell added
    if (result.Count > 0)
    {
        mostCommonSynSet = result[result.Count - 1];
    }

    if (mostCommonSynSet == null)
    {
        throw new Exception("Failed to get most common synset");
    }

    return result;
}
/// <summary>
/// Handles the "find LCS" button: for every pair of synsets of the start and
/// destination words (using the selected POS), finds the closest mutually reachable
/// synset via the Hypernym relation and lists its words as "{w1, w2, ...}".
/// Displays "false" when nothing is found.
/// </summary>
private void findLCS_Click(object sender, EventArgs e)
{
    int found = 0;
    LinkBox.Items.Clear();

    // retrieve the synsets for the start word; GetSynSets throws on bad input
    Set<SynSet> synStartSet = null;
    try
    {
        synStartSet = _wordNetEngine.GetSynSets(StartWord.Text, (WordNetEngine.POS)pos.SelectedItem);
    }
    catch (Exception)
    {
        MessageBox.Show("Invalid Start SynSet ID");
        return;
    }

    // retrieve the synsets for the destination word
    Set<SynSet> synDestSet = null;
    try
    {
        synDestSet = _wordNetEngine.GetSynSets(DestWord.Text, (WordNetEngine.POS)pos.SelectedItem);
    }
    catch (Exception)
    {
        MessageBox.Show("Invalid Dest SynSet ID");
        return;
    }

    if (synStartSet.Count > 0)
    {
        // relation(s) used to climb the hierarchy; Hypernym only for now
        WordNetEngine.SynSetRelation[] vlist = new WordNetEngine.SynSetRelation[1];
        vlist[0] = WordNetEngine.SynSetRelation.Hypernym;
        //vlist[1] = WordNetEngine.SynSetRelation.InstanceHypernym;
        //vlist[2] = WordNetEngine.SynSetRelation.Hyponym;
        //vlist[3] = WordNetEngine.SynSetRelation.InstanceHyponym;

        foreach (SynSet synSrcSet in synStartSet)
        {
            foreach (SynSet synDstSet in synDestSet)
            {
                // FIX: removed a GetShortestPathTo call whose result was never used
                SynSet s = synSrcSet.GetClosestMutuallyReachableSynset(synDstSet, vlist);
                if (s != null)
                {
                    // render the LCS as "{word1, word2, ...}"
                    string desc = "{" + string.Join(", ", s.Words) + "}";
                    LinkBox.Items.Add(desc);
                    LinkBox.Text = desc;
                    found++;
                }
            }
        }

        if (found == 0)
        {
            LinkBox.Text = "false";
        }
    }
    else
    {
        LinkBox.Text = "false";
        // MessageBox.Show("No synsets found");
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="SynsetWithGloss"/> class,
/// pairing a synset with its gloss text and an index value.
/// </summary>
/// <param name="synSet">The underlying synset.</param>
/// <param name="gloss">The gloss (definition text) associated with the synset.</param>
/// <param name="index">The index value to store (presumably the sense position — confirm at call sites).</param>
public SynsetWithGloss(SynSet synSet, string gloss, int index) { SynSet = synSet; Gloss = gloss; Index = index; }
/// <summary>
/// Builds the list of candidate word senses for the given index entry. Candidates are
/// drawn from the synset selected by <paramref name="pos"/>.Sense (and, when that synset
/// yields too few words, from the entry's other synsets), annotated with SemCor frequency
/// data, de-duplicated, filtered, and finally sorted with CompareLexeme.
/// </summary>
/// <param name="index">WordNet index entry for the word being looked up.</param>
/// <param name="pos">Word info carrying the requested sense number and POS.</param>
/// <returns>The candidate senses for the word.</returns>
private static MyWordInfo[] LookupCandidates(Index index, MyWordInfo pos)
{
    // sense numbers are 1-based; default to the first sense when unset
    if (pos.Sense < 0)
    {
        pos.Sense = 1;
    }

    // load the synset for the requested sense
    SynSet synset = new Wnlib.SynSet(index.SynsetOffsets[pos.Sense - 1], index.PartOfSpeech, index.Wd, null, pos.Sense - 1);

    // lexemes[i] pairs with synIndex[i]: the word and the synset offset it came from
    ArrayList lexemes = new ArrayList();
    ArrayList synIndex = new ArrayList();
    foreach (Lexeme obj in synset.words)
    {
        lexemes.Add(obj);
        synIndex.Add(index.SynsetOffsets[pos.Sense - 1]);
    }

    if (index.SynsetOffsets.Length > 1)
    {
        if (lexemes.Count <= 1)
        {
            // the requested synset was (nearly) empty: gather words from every synset of the entry
            for (int i = 0; i < index.SynsetOffsets.Length; i++)
            {
                synset = new SynSet(index.SynsetOffsets[i], index.PartOfSpeech, index.Wd, null, i);
                foreach (Lexeme obj in synset.words)
                {
                    synIndex.Add(index.SynsetOffsets[i]);
                    lexemes.Add(obj);
                }
            }
        }
        else
        {
            synset = new SynSet(index.SynsetOffsets[0], index.PartOfSpeech, index.Wd, null, 0);
            int count = 0;
            // get top most frequency word senses (at most 5 more, from the first synset)
            foreach (Lexeme obj in synset.words)
            {
                lexemes.Add(obj);
                synIndex.Add(index.SynsetOffsets[0]);
                ++count;
                if (count > 4)
                {
                    break;
                }
            }
        }
    }

    // attach SemCor frequency data and de-duplicate by lowercased word text
    ArrayList sortedSet = new ArrayList();
    Hashtable trace = new Hashtable();   // word texts already accepted
    int hasSem = 0;                      // how many words have SemCor data so far
    for (int i = 0; i < lexemes.Count; i++)
    {
        Lexeme word = (Lexeme)lexemes[i];
        word.word = word.word.ToLower();
        int senIndex = (int)synIndex[i];
        if (senIndex != -1 && word.wnsns > 0)
        {
            word.semcor = new Wnlib.SemCor(word, senIndex);
            lexemes[i] = word;
            ++hasSem;
        }
        if (!trace.ContainsKey(word.word))
        {
            // keep words with a positive SemCor frequency, or the first few while little data exists
            if ((word.semcor != null && word.semcor.semcor > 0) || (hasSem < 4))
            {
                trace[word.word] = 1;
                sortedSet.Add(word);
            }
        }
        //catch {}
    }

    // turn the surviving lexemes into MyWordInfo candidates
    var words = (Lexeme[])sortedSet.ToArray(typeof(Lexeme));
    ArrayList candidates = new ArrayList();
    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i].word.Replace("_", " ");
        // skip words whose first char is <= 'Z' (uppercase letters, digits, punctuation) —
        // presumably filters out proper names; TODO confirm intent
        if (word[0] <= 'Z')
        {
            continue;
        }
        MyWordInfo newpos = new MyWordInfo(word, pos.Pos);
        newpos.Sense = words[i].wnsns;
        if (words[i].semcor != null)
        {
            newpos.Frequency = words[i].semcor.semcor;
        }
        else
        {
            newpos.Frequency = 0;
        }
        candidates.Add(newpos);
    }

    // ensure the original word itself is among the candidates
    if (!trace.ContainsKey(index.Wd))
    {
        candidates.Add(pos);
    }

    if (candidates.Count > 1)
    {
        CompareLexeme comparer = new CompareLexeme();
        candidates.Sort(comparer);
    }

    return ((MyWordInfo[])candidates.ToArray(typeof(MyWordInfo)));
}
/// <summary>
/// Recursively walks the parse tree and, at each leaf node of a recognized goal type,
/// assigns the disambiguated sense text (node.Sense) and sense number (node.SenseNo)
/// from the pending NodesSenses / SensesNos queues. <paramref name="j"/> indexes into
/// DisambRes (the "word : definition" disambiguation results) and advances whenever a
/// sense is consumed.
/// </summary>
/// <param name="parsetree">The parse tree that owns the nodes.</param>
/// <param name="node">The node being processed; recursed into when it has children.</param>
/// <param name="wordinfoArr">Word-info accumulator passed through the recursion (not modified here).</param>
/// <param name="j">Index into DisambRes; incremented each time a sense is consumed.</param>
private void FillSenses(ParseTree parsetree, ParseNode node, ref ArrayList wordinfoArr, ref int j)
{
    if (node.Children != null)
    {
        // interior node: recurse into every child
        for (int i = 0; i < node.Children.Count; i++)
        {
            ParseNode pn = (ParseNode)node.Children[i];
            FillSenses(parsetree, pn, ref wordinfoArr, ref j);
        }
    }
    else
    {
        SentenceParser dummysp = new SentenceParser();
        string str = node.Goal;
        if (str == "PPN")
        {
            // proper-noun leaf: disambiguate the placeholder "Number" (numeric text)
            // or "proper_name" (anything else) instead of the literal token
            WordSenseDisambiguator wsd = new WordSenseDisambiguator();
            double score;
            double num;
            if (double.TryParse(node.Text, out num))
            {
                MyWordInfo[] ret = wsd.MMG_Disambiguate(new MyWordInfo[] { new MyWordInfo("Number", PartsOfSpeech.Noun) }, out score);
                PartOfSpeech pos = PartOfSpeech.of(PartsOfSpeech.Noun);
                Index index = Wnlib.Index.lookup(ret[0].Word.ToLower(), pos);
                SynSet sense = new SynSet(index, ret[0].Sense, null);
                node.Sense = sense.defn;
                node.SenseNo = ret[0].Sense;
            }
            else
            {
                MyWordInfo[] ret = wsd.MMG_Disambiguate(new MyWordInfo[] { new MyWordInfo("proper_name", PartsOfSpeech.Noun) }, out score);
                PartOfSpeech pos = PartOfSpeech.of(PartsOfSpeech.Noun);
                Index index = Wnlib.Index.lookup(ret[0].Word.ToLower(), pos);
                SynSet sense = new SynSet(index, ret[0].Sense, null);
                node.Sense = sense.defn;
                node.SenseNo = ret[0].Sense;
            }
        }
        else if (str == "N" || str.Contains("NPP") || (str == "VING") || str.Contains("PPJ") || (str == "VPSP") || (str == "BE1") || (str == "BE2") || (str == "V") || str.Contains("CPADJ") || str.Contains("ADJ") || str.Contains("PADV") || str.Contains("ADV") || str == "VINF")
        {
            // modifications 5/7 (translated from "ta3deelat 5/7")
            // DisambRes entries look like "word : definition"
            string [] spltstr = DisambRes[j].ToString().Split(':');
            VerbSense VS = new VerbSense();
            string[] spltstr2 = new string[10];
            string[] spltstr3 = new string[10];
            if (node.Senses != null && node.Goal.Contains("V"))
            {
                // verb node carrying VerbSense data: compare its base form against the next result word
                VS = (VerbSense)node.Senses[0];
                spltstr2 = VS.Sense.ToString().Split('#');
                spltstr3 = spltstr[0].Split(' ');
                ArrayList results = dummysp.GetINFOfVerb(spltstr2[0]);
                if (results.Count > 0)
                {
                    spltstr2[0] = (string)results[0];
                }
                if (spltstr3[0] == spltstr2[0])
                {
                    if (NodesSenses.Count > 0)
                    {
                        // consume the next pending sense for this node
                        node.Sense = (string)NodesSenses[0];
                        node.SenseNo = (int)SensesNos[0];
                        SensesNos.RemoveAt(0);
                        NodesSenses.RemoveAt(0);
                        j++;
                    }
                }
            }
            if (str == "N")
            {
                // noun leaf: consume the next pending sense unconditionally
                node.Sense = (string)NodesSenses[0];
                node.SenseNo = (int)SensesNos[0];
                SensesNos.RemoveAt(0);
                NodesSenses.RemoveAt(0);
                j++;
            }
            else
            {
                string NodeWord = SyntacticAnalyzer.SentenceParser.GetWordString(parsetree, node);
                spltstr3 = spltstr[0].Split(' ');
                string nodeWord = NodeWord.ToLower();
                if (node.Goal == "V" || node.Goal == "BE1" || node.Goal == "VINF" || node.Goal == "VPSP" || node.Goal == "VING")
                {
                    // reduce verbs to their base form before comparing
                    ArrayList results = dummysp.GetINFOfVerb(nodeWord);
                    if (results.Count > 0)
                    {
                        nodeWord = (string)results[0];
                    }
                }
                // build inflected variants to match the result word against the node word:
                // last char replaced by "ies"; node word + "ing"; + "ings"; node word
                // minus last char + "ing" (and + "ings")
                string dummy = spltstr3[0];
                dummy = dummy.Remove(dummy.Length - 1);
                dummy = dummy + "ies";
                string dummy2 = nodeWord + "ing";
                string dummy3 = dummy2 + "s";
                string dummy4 = nodeWord.Remove(nodeWord.Length - 1);
                dummy4 = dummy4 + "ing";
                if (spltstr3[0] == nodeWord || spltstr3[0] + 's' == nodeWord || spltstr3[0] + "es" == nodeWord || dummy == nodeWord || spltstr3[0] == dummy2 || spltstr3[0] == dummy3 || spltstr3[0] == dummy4 || spltstr3[0] == dummy4 + 's')
                {
                    if (NodesSenses.Count > 0)
                    {
                        // the result word matches (in some inflection): consume the next pending sense
                        node.Sense = (string)NodesSenses[0];
                        node.SenseNo = (int)SensesNos[0];
                        SensesNos.RemoveAt(0);
                        NodesSenses.RemoveAt(0);
                        j++;
                    }
                }
            }
        }
    }
}
/// <summary>
/// Runs disambiguation over the sentence parse trees, resolves the definition text of
/// every disambiguated word sense through the WordNet index (stemming verbs first),
/// and attaches the sense text and numbers to the parse-tree nodes. Results accumulate
/// in NodesSenses, SensesNos and DisambRes ("word : definition" strings).
/// </summary>
public void beginDisambiguate()
{
    Disambiguate(SParseTrees);

    // get the text of the senses
    for (int i = 0; i < NewParseTreeSenses.Count; i++)
    {
        MyWordInfo[] mwiArr = (MyWordInfo[])NewParseTreeSenses[i];
        ParseTree pt;
        pt = (ParseTree)NewSParseTrees[i];
        AddArrStems(NewSParseTrees);
        for (int j = 0; j < mwiArr.Length; j++)
        {
            Wnlib.PartOfSpeech p = Wnlib.PartOfSpeech.of((Wnlib.PartsOfSpeech)mwiArr[j].Pos);
            try
            {
                ArrayList results = new ArrayList();
                Wnlib.Index index;
                // the stems are needed here to get the index
                if (mwiArr[j].Pos == PartsOfSpeech.Verb)
                {
                    // verbs: prefer the infinitive form for the index lookup when available
                    SentenceParser dummysp = new SentenceParser();
                    results = dummysp.GetINFOfVerb(mwiArr[j].Word.ToLower());
                    if (results.Count > 0)
                    {
                        index = Wnlib.Index.lookup((string)results[0], p);
                    }
                    else
                    {
                        index = Wnlib.Index.lookup(mwiArr[j].Word.ToLower(), p);
                    }
                }
                else
                {
                    index = Wnlib.Index.lookup(mwiArr[j].Word.ToLower(), p);
                }
                SynSet sense = new SynSet(index, mwiArr[j].Sense, null);
                NodesSenses.Add(sense.defn);
                SensesNos.Add(mwiArr[j].Sense);
                // record "word : definition" for later matching in FillSenses
                string s;
                if (results.Count > 0)
                {
                    s = (string)results[0] + " : " + sense.defn;
                }
                else
                {
                    s = mwiArr[j].Word.ToLower() + " : " + sense.defn;
                }
                DisambRes.Add(s);
            }
            catch
            {
                // lookup by word failed: retry with the stem; give up silently if that fails too
                try
                {
                    Wnlib.Index index = Wnlib.Index.lookup(Stems[j], p);
                    SynSet sense = new SynSet(index, mwiArr[j].Sense, null);
                    NodesSenses.Add(sense.defn);
                    SensesNos.Add(mwiArr[j].Sense);
                    string s = Stems[j].ToLower() + " : " + sense.defn;
                    DisambRes.Add(s);
                }
                catch { };
            };
        }
        Senses = NodesSenses;
    }

    // add sense text & sense number to the nodes
    AddNodesSenses(NewSParseTrees);

    // put the output parse trees back into SParseTrees
    SParseTrees = NewSParseTrees;
}
/// <summary>
/// Simplified-Lesk disambiguation: scores each candidate sense by the token overlap
/// between the context and (a) the sense's own synonyms/gloss (weight 60), (b) the
/// synonyms/glosses of strongly related synsets — hypernyms and hyponyms — (weight 20),
/// and (c) those of weakly related synsets — part holonyms/meronyms and instance
/// hypernyms/hyponyms — (weight 5). Returns the best-scoring sense.
/// </summary>
/// <param name="Senses">Candidate senses; must be non-empty (the fallback indexes element 0).</param>
/// <param name="context">Context tokens to measure overlap against.</param>
/// <returns>The highest-scoring sense, or the first sense when nothing scores above zero.</returns>
private SynSet Disambiguate(List<SynSet> Senses, string[] context)
{
    SynSet bestSense = null;   // tracks the sense that maximizes the overlap score
    int bestScore = 0;

    //Rada Recommendation
    //if (Senses.Count > 3)
    //    consider only the first 3 senses

    foreach (SynSet sense in Senses)
    {
        // own synonyms + gloss: strongest evidence
        string senseData = string.Join(" ", sense.Synonyms) + " " + sense.Gloss;
        senseData = senseData.Replace("_", " ").Replace("-", " ");
        int overlap = Intersect(_tokenizer.Tokenize(senseData), context) * 60;

        // strong relations
        overlap += Intersect(_tokenizer.Tokenize(GatherRelatedSenseData(sense,
            WordNetApi.Core.WordNetEngine.SynSetRelation.Hypernym,
            WordNetApi.Core.WordNetEngine.SynSetRelation.Hyponym)), context) * 20;

        // weak relations
        overlap += Intersect(_tokenizer.Tokenize(GatherRelatedSenseData(sense,
            WordNetApi.Core.WordNetEngine.SynSetRelation.PartHolonym,
            WordNetApi.Core.WordNetEngine.SynSetRelation.PartMeronym,
            WordNetApi.Core.WordNetEngine.SynSetRelation.InstanceHypernym,
            WordNetApi.Core.WordNetEngine.SynSetRelation.InstanceHyponym)), context) * 5;

        if (overlap > bestScore)
        {
            bestScore = overlap;
            bestSense = sense;
        }
    }

    // no sense scored above zero: fall back to the first (most common) sense.
    // NOTE: as before, this throws when Senses is empty.
    if (bestSense == null)
    {
        bestSense = Senses[0];
    }
    return (bestSense);
}

/// <summary>
/// Concatenates the synonyms and gloss of every synset related to <paramref name="sense"/>
/// through any of the given relations (non-recursive traversal). Extracted to remove six
/// duplicated gathering loops from Disambiguate.
/// </summary>
private string GatherRelatedSenseData(SynSet sense, params WordNetApi.Core.WordNetEngine.SynSetRelation[] relations)
{
    string data = "";
    foreach (WordNetApi.Core.WordNetEngine.SynSetRelation relation in relations)
    {
        foreach (SynSet related in sense.GetRelatedSynSets(relation, false))
        {
            data = data + string.Join(" ", related.Synonyms) + " " + related.Gloss;
        }
    }
    return data;
}
/// <summary>
/// Expands a query with WordNet synonyms: every non-stop-word token is added with boost
/// "^5", followed by the words of its most common noun / adjective / verb / adverb
/// synsets (as enabled by the flags), each with boost "^1" and only when not already
/// present in the expansion.
/// </summary>
/// <param name="tokens">The query tokens to expand.</param>
/// <param name="checkForNoun">Whether to add noun synonyms.</param>
/// <param name="checkForAdj">Whether to add adjective synonyms.</param>
/// <param name="checkForVerb">Whether to add verb synonyms.</param>
/// <param name="checkForAdverb">Whether to add adverb synonyms.</param>
/// <returns>The space-separated expanded query string.</returns>
public String QueryExpansion(IEnumerable <String> tokens, Boolean checkForNoun, Boolean checkForAdj, Boolean checkForVerb, Boolean checkForAdverb)
{
    String expansion = "";
    foreach (String token in tokens)
    {
        // NOTE(review): sw.Contains(token) also matches when the token is merely a
        // substring of a stop word, not only an exact match — preserved as-is; confirm intent.
        bool isToken = false;
        foreach (string sw in STOP_WORDS)
        {
            if (sw.Contains(token))
            {
                isToken = true;
            }
        }
        if (token == "" || token == " " || isToken == true)
        {
            // FIX: the format string printed the token twice ("{0} : {0}");
            // it now shows the stop-word flag as intended
            System.Console.WriteLine("Token: {0} : {1}", token, isToken);
            continue;
        }

        System.Console.WriteLine("\n\n~~~~~~~~~~~~~~~~~~~~~~~~ Getting synonyms from WordNet\n");
        System.Console.WriteLine("Synonyms for token = " + token + "\n");
        expansion += token + "^5 ";

        // For each token, add synonyms of its most common synset per POS
        // src: https://developer.syn.co.in/api/Syn.WordNet.WordNetEngine.html
        SynSet synSet_noun = wordNetEngine.GetMostCommonSynSet(token, PartOfSpeech.Noun);
        SynSet synSet_adjective = wordNetEngine.GetMostCommonSynSet(token, PartOfSpeech.Adjective);
        SynSet synSet_verb = wordNetEngine.GetMostCommonSynSet(token, PartOfSpeech.Verb);
        SynSet synSet_adverb = wordNetEngine.GetMostCommonSynSet(token, PartOfSpeech.Adverb);

        System.Console.WriteLine("~~~~~~~ Nouns\n");
        if (checkForNoun)
        {
            expansion = AppendSynonyms(expansion, synSet_noun);
        }

        System.Console.WriteLine("~~~~~~~ Adjectives\n");
        if (checkForAdj)
        {
            expansion = AppendSynonyms(expansion, synSet_adjective);
        }

        System.Console.WriteLine("~~~~~~~ Verbs\n");
        if (checkForVerb)
        {
            expansion = AppendSynonyms(expansion, synSet_verb);
        }

        System.Console.WriteLine("~~~~~~~ Adverbs\n");
        if (checkForAdverb)
        {
            expansion = AppendSynonyms(expansion, synSet_adverb);
        }

        System.Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~ \n\n");
    }
    return (expansion);
}

/// <summary>
/// Appends each word of <paramref name="synSet"/> to <paramref name="expansion"/> with
/// boost "^1", skipping words already present. A null synset or word list is a no-op;
/// this null check replaces the former catch(NullReferenceException) control flow
/// (the engine appears to yield no synset when the word lacks one for the POS).
/// </summary>
private static String AppendSynonyms(String expansion, SynSet synSet)
{
    if (synSet == null || synSet.Words == null)
    {
        return expansion;
    }
    foreach (var synonym in synSet.Words)
    {
        if (!expansion.Contains(synonym))
        {
            expansion += synonym + "^1 ";
            System.Console.WriteLine(synonym);
        }
    }
    return expansion;
}
/// <summary>
/// Initializes a new instance of the <see cref="Comparison"/> class.
/// </summary>
/// <param name="tag">The tag identifying this comparison.</param>
/// <param name="value">The comparison value.</param>
/// <param name="highSynSet">The highest-scoring synset, if any; defaults to null.</param>
public Comparison(string tag, double value, SynSet highSynSet = null) { Tag = tag; Value = value; HighSynSet = highSynSet; }
/// <summary>
/// Convert method creates a serialized dictionary file from wndb files.
/// dictPack - path to wndb data files
/// jsonFile - destination file (BSON-serialized ExpDictStorage)
/// </summary>
public static void Convert(string dictPack, string jsonFile)
{
    WNDB wndb = new WNDB(dictPack);
    var poses = (new[] { "n", "v", "a", "r" }).Select(s => PartOfSpeech.of(s));

    // Convert to Dictionary:
    // lemma -> { SynSetGroup: PosSymbol, Synsets = { synset: synonims, definitions, examples } }
    var dict = new Dictionary <string, List <ExpSynSetGroup> >();
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Data of {0}", pos.name);
        foreach (var data in wndb.GetData(pos))
        {
            //data.adj includes 'a' & 's' pos symbols
            char posSymbol = pos.symbol.First();
            bool singleWord = false;
            if (data.origWords.Count() == 1)
            {
                var w = data.origWords.First().word;
                singleWord = w == w.ToLower();
            }
            var synSet = new SynSet
            {
                // Skip synonims when there is a single lowercase word; the singular
                // Definition/Example fields are used instead of the plural lists
                // when exactly one item exists
                Synonims = (singleWord) ? null : data.origWords.Select(ow => ow.word).ToArray(),
                Definition = (data.definitions.Count() == 1) ? data.definitions.First() : null,
                Definitions = (data.definitions.Count() > 1) ? data.definitions : null,
                Example = (data.examples?.Count() == 1) ? data.examples.First() : null,
                Examples = (data.examples?.Count() > 1) ? data.examples : null
            };
            // register the synset under every lowercase lemma it contains, grouped by POS
            foreach (var lemma in data.origWords.Select(ow => ow.word.ToLower()))
            {
                var synGrps = dict.GetValue(lemma);
                if (synGrps != null)
                {
                    var grp = synGrps.FirstOrDefault(g => g.PosSymbol == posSymbol);
                    if (grp == null)
                    {
                        synGrps.Add(new ExpSynSetGroup(posSymbol, synSet));
                    }
                    else
                    {
                        grp.Synsets.Add(synSet);
                    }
                }
                else
                {
                    dict.Add(lemma, new List <ExpSynSetGroup> { new ExpSynSetGroup(posSymbol, synSet) });
                }
            }
        }
    }

    // exceptions
    //TODO: remove morphes, ...
    var excepts = new Dictionary <string, List <DictException> >();
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Exceptions of {0}", pos.name);
        foreach (var exwords in wndb.GetExceptions(pos))
        {
            // exwords[0] is the inflected form; the remaining entries are base forms
            var morph = Morph.GetBasicForm(exwords[0], pos);
            for (int i = 1; i < exwords.Length; i++)
            {
                var baseForm = exwords[i];
                // skip base forms the morpher would derive anyway
                if (baseForm == exwords[0] || baseForm == morph)
                {
                    //Console.WriteLine($"Skip: {(exwords[0])} -> {baseForm}/{morph}");
                    continue;
                }
                List <ExpSynSetGroup> synGrps = dict.GetValue(baseForm);
                if (synGrps == null && baseForm.Contains('-'))
                {
                    // hyphenated entries may be stored with spaces instead
                    baseForm = baseForm.Replace('-', ' ');
                    dict.TryGetValue(baseForm, out synGrps);
                }
                if (synGrps != null)
                {
                    var posSymbols = string.Join("", synGrps.Select(sg => sg.PosSymbol));
                    var except = new DictException { BasicForm = baseForm, PosSymbols = posSymbols };
                    List <DictException> baseForms;
                    if (excepts.TryGetValue(exwords[0], out baseForms))
                    {
                        if (!baseForms.Any(e => e.BasicForm == baseForm))
                        {
                            baseForms.Add(except);
                        }
                    }
                    else
                    {
                        excepts.Add(exwords[0], new List <DictException> { except });
                    }
                }
            }
        }
    }

    Console.WriteLine("Save changes");
    var storage = new ExpDictStorage { SynSets = dict, Exceptions = excepts };
    var serializer = new JsonSerializer();
    serializer.NullValueHandling = NullValueHandling.Ignore;   // keep the output compact
    using (var stream = File.Open(jsonFile, FileMode.Create))
    using (var writer = new BsonWriter(stream))
    {
        serializer.Serialize(writer, storage);
    }
}
/// <summary>
/// Maps an ontology concept to WordNet senses using its MapLex fillers. Each filler of
/// the form "word-pos" is parsed into a MyWordInfo, the resulting words are disambiguated,
/// and one WordOlogy record per resolved sense is stored in wordologyArr.
/// </summary>
/// <param name="concept">The ontology concept being mapped.</param>
/// <param name="maplexProperty">Property whose fillers carry the MapLex word/POS data.</param>
private void MapConceptsWithMapLex(string concept, Property maplexProperty)
{
    MyWordInfo mwi;
    List<MyWordInfo> maplexsenses = new List<MyWordInfo>();
    int NoOfSensesSucceeded = 0;

    // parse each filler (e.g. "a_bomb-n") into word text plus POS
    for (int i = 0; i < maplexProperty.Fillers.Count; i++)
    {
        string tmp = maplexProperty.Fillers[i].ScalarFiller;
        char[] charr = new char[] { '-', '_' };
        string[] splt = tmp.Split(charr);
        // some fillers carry no type marker, e.g. "a-bomb"
        if (splt.Length > 1)
        {
            mwi = new MyWordInfo();
            // all parts but the last form the (space-joined) word
            for (int k = 0; k < splt.Length - 2; k++)
            {
                mwi.Word += splt[k] + " ";
            }
            mwi.Word += splt[splt.Length - 2];
            if (splt[splt.Length - 1].Length == 2)
            {
                // a trailing two-character part encodes the POS in its first letter
                char posChar = splt[splt.Length - 1][0];
                if (posChar == 'v') { mwi.Pos = Wnlib.PartsOfSpeech.Verb; }
                else if (posChar == 'n') { mwi.Pos = Wnlib.PartsOfSpeech.Noun; }
                else if (posChar == 'a') { mwi.Pos = Wnlib.PartsOfSpeech.Adj; }
                else if (posChar == 'r') { mwi.Pos = Wnlib.PartsOfSpeech.Adv; }
                else { mwi.Pos = Wnlib.PartsOfSpeech.Unknown; }
            }
            else
            {
                // no POS marker: the last part is actually part of the word
                mwi.Pos = Wnlib.PartsOfSpeech.Unknown;
                mwi.Word += " " + splt[splt.Length - 1];
            }
            // skip consecutive duplicates
            if (i == 0 || (maplexsenses.Count > 0 && (mwi.Word != maplexsenses[maplexsenses.Count - 1].Word || mwi.Pos != maplexsenses[maplexsenses.Count - 1].Pos)))
            {
                maplexsenses.Add(mwi);
            }
        }
    }

    if (maplexsenses.Count > 0)
    {
        WordSenseDisambiguator wsd = new WordSenseDisambiguator();
        // FIX: removed a dead intermediate array allocation and the manual element copy
        MyWordInfo[] res = wsd.Disambiguate(maplexsenses.ToArray());

        int i = 0;
        foreach (MyWordInfo wi in res)
        {
            // FIX: one WordOlogy instance per sense. Previously a single shared instance
            // was mutated and re-added, so every stored record referenced the same
            // object and ended up with the last sense's data.
            WordOlogy WO = new WordOlogy();

            string tmp = maplexProperty.Fillers[i].ScalarFiller;
            char[] charr = new char[] { '-', '_' };
            string[] splt = tmp.Split(charr);
            if (splt.Length > 1 && splt[splt.Length - 1].Length == 2)
            {
                WO.SenseNo = splt[splt.Length - 1];
            }
            else
            {
                // sense doesn't have POS
            }

            Wnlib.PartOfSpeech p = Wnlib.PartOfSpeech.of((Wnlib.PartsOfSpeech)wi.Pos);
            try
            {
                Wnlib.Index index = Wnlib.Index.lookup(wi.Word.ToLower(), p);
                SynSet sense = new SynSet(index, res[i].Sense, null);
                WO.Sense = sense.defn;
                // AllSensesMapped++;
                NoOfSensesSucceeded++;
                try { WO.Pos = p.name; }
                catch { WO.Pos = wi.Pos.ToString(); }
                ID++;
                WO.Word = wi.Word;
                WO.ID = ID;
                WO.Concept = concept;
                wordologyArr.Add(WO);
            }
            catch { }; // best-effort: unresolvable senses are skipped

            // NOTE(review): this counts one failure per sense processed before the first
            // success, not one per failed lookup — preserved as-is; confirm intent
            if (NoOfSensesSucceeded == 0)
            {
                CannotGetSenseExeption++;
            }
            i++;
            //bf.Serialize(fs, "\n" + WO);
        }
        conceptcounter++;
    }
}
/// <summary>
/// Maps an ontology concept to WordNet senses without MapLex information by looking the
/// concept up as each part of speech (noun, verb, adjective, adverb) and recording one
/// WordOlogy entry per (relatedness option, sense number) pair found.
/// </summary>
/// <param name="concept">The ontology concept being mapped.</param>
private void MapConceptsWithOutMapLex(string concept)
{
    int NoOfSensesSucceeded = 0;

    // FIX: the four near-identical POS branches are folded into one helper; this also
    // fixes the adverb branch, which requested relatedness for PartsOfSpeech.Noun
    // (copy-paste error) instead of Adv.
    NoOfSensesSucceeded += MapConceptAsPos(concept, "noun", PartsOfSpeech.Noun);
    NoOfSensesSucceeded += MapConceptAsPos(concept, "verb", PartsOfSpeech.Verb);
    NoOfSensesSucceeded += MapConceptAsPos(concept, "adj", PartsOfSpeech.Adj);
    NoOfSensesSucceeded += MapConceptAsPos(concept, "adv", PartsOfSpeech.Adv);

    if (NoOfSensesSucceeded != 0)
    {
        conceptcounter++;
    }
}

/// <summary>
/// Looks up <paramref name="concept"/> as the given POS and adds one WordOlogy record per
/// (relatedness option, sense) pair to wordologyArr. Lookup failures are swallowed,
/// preserving the original best-effort behavior.
/// </summary>
/// <param name="concept">The concept/word to look up.</param>
/// <param name="posName">POS name used for the Search and the stored record ("noun", ...).</param>
/// <param name="partOfSpeech">POS enum value used for the index lookup and relatedness.</param>
/// <returns>The number of senses recorded.</returns>
private int MapConceptAsPos(string concept, string posName, PartsOfSpeech partOfSpeech)
{
    int succeeded = 0;
    try
    {
        Wnlib.Index index = Wnlib.Index.lookup(concept.ToLower(), PartOfSpeech.of(partOfSpeech));
        if (index != null)
        {
            Opt[] relatedness = WordsMatching.Relatedness.GetRelatedness(partOfSpeech);
            foreach (Opt o in relatedness)
            {
                for (int senseNumber = 0; senseNumber < index.sense_cnt; senseNumber++)
                {
                    Search se = new Search(concept, true, PartOfSpeech.of(posName), o.sch, senseNumber);
                    SynSet sense = new SynSet(index, senseNumber, se);
                    // FIX: one record per sense. Previously a single shared WordOlogy
                    // instance was mutated and re-added, making every stored entry the
                    // same object (with the last sense's data).
                    WordOlogy WO = new WordOlogy();
                    WO.Pos = posName;
                    WO.Sense = sense.defn;
                    WO.Concept = concept;
                    WO.Word = concept;
                    WO.ID = ID;
                    ID++;
                    succeeded++;
                    //bf.Serialize(fs, "\n" + WO);
                    wordologyArr.Add(WO);
                }
            }
        }
    }
    catch { } // best-effort: a failed POS lookup contributes nothing
    return succeeded;
}
/// <summary>
/// Convert method creates db rows from wndb files:
/// one SynSet row per data line, with Lemma / Writing / SynsetLemma rows for its words,
/// and Except rows for the morphological exception lists.
/// dictPack - path to wndb data files
/// context - dest db context
/// </summary>
public static void Convert(string dictPack, WordNetContext context)
{
    WNDB wndb = new WNDB(dictPack);
    var synWords = new List <string>();
    // int ind;
    var wordToLemma = new Dictionary <string, Lemma>();   // lowercase lemma text -> Lemma row
    var words = new Dictionary <string, Writing>();       // original spelling -> Writing row
    var poses = (new [] { "n", "v", "a", "r" }).Select(s => PartOfSpeech.of(s));
    foreach (var pos in poses)
    {
        Console.WriteLine("Process Data of {0}", pos.name);
        // ind = 0;
        foreach (var data in wndb.GetData(pos))
        {
            if (data.pos != pos.symbol && !(data.pos == "s" && pos.symbol == "a")) //data.adj includes 'a' & 's' pos symbols
            {
                throw new Exception("pos!=data.pos");
            }
            var synset = new SynSet { Pos = data.pos };
            context.SynSets.Add(synset);
            synWords.Clear();   // lowercase words already linked to this synset
            foreach (var oword in data.origWords)
            {
                Lemma lemma;
                string lcWord = oword.word.ToLower();
                // add lemma (or extend its POS list)
                if (!wordToLemma.TryGetValue(lcWord, out lemma))
                {
                    lemma = new Lemma { Value = lcWord, Poses = data.pos };
                    wordToLemma.Add(lcWord, lemma);
                    context.Lemmas.Add(lemma);
                }
                else if (!lemma.Poses.Contains(data.pos))
                {
                    lemma.Poses += data.pos;
                }
                if (synWords.IndexOf(lcWord) < 0)
                {
                    synWords.Add(lcWord);
                    // add SynSet <-> Lemma relation
                    context.SynsetLemmas.Add(new SynsetLemma { SynSet = synset, Lemma = lemma });
                }
                // add original word if it differs from lemma
                Writing word;
                if (lcWord != oword.word)
                {
                    if (!words.TryGetValue(oword.word, out word))
                    {
                        word = new Writing { Value = oword.word, Lemma = lemma };
                        words.Add(oword.word, word);
                        context.Writings.Add(word);
                    }
                    else if (word.Lemma != lemma)
                    {
                        // same spelling already bound to a different lemma: log and skip
                        Console.WriteLine("Word mix: {0} {1} {2}", oword.word, lemma.Value, word.Lemma.Value);
                        continue;
                    }
                }
            }
            synset.Definition = string.Join(";", data.definitions);
            synset.Example = string.Join(";", data.examples);
            // ind++;
            // if (ind % 1000 == 0)
            // ShowProgress(ind.ToString());
        }
        Console.WriteLine("Save changes");
        context.SaveChanges();

        // exceptions
        //TODO: remove morphes, ...
        Console.WriteLine("Process Exceptions of {0}", pos.name);
        // ind = 0;
        foreach (var exwords in GetExceptions(wndb, pos))
        {
            // exwords[0] is the inflected form; the remaining entries are base forms
            for (int i = 1; i < exwords.Length; i++)
            {
                if (exwords[i] == exwords[0])
                {
                    continue;
                }
                Lemma lemma;
                // hyphenated base forms may be stored with spaces instead
                if (wordToLemma.TryGetValue(exwords[i], out lemma) || (exwords[i].Contains('-') && wordToLemma.TryGetValue(exwords[i].Replace('-', ' '), out lemma)))
                {
                    context.Excepts.Add(new Except { Value = exwords[0], MainForm = exwords[i], Lemma = lemma });
                }
                // else
                // {
                // Console.WriteLine("Lemma not found {0}", exwords[i]);
                // context.Excepts.Add(new Except { Value = exwords[0], MainForm = exwords[i] });
                // }
            }
            // ind++;
            // if (ind % 1000 == 0)
            // ShowProgress(ind.ToString());
        }
        Console.WriteLine("Save changes");
        context.SaveChanges();
    }
    //Console.WriteLine("Save changes");
    context.SaveChanges();
}
/// <summary>
/// Initializes a new instance of the <see cref="WordNetMemoryProvider"/> class.
/// Loads the entire WordNet database into memory: three passes over the data files
/// (count synsets, create shells, instantiate/link them), then an index-file pass
/// that organizes the synsets by POS and word.
/// </summary>
/// <param name="dataPath">The data path.</param>
/// <exception cref="System.ArgumentNullException">dataPath</exception>
/// <exception cref="System.IO.DirectoryNotFoundException">The data directory does not exist.</exception>
/// <exception cref="System.IO.FileNotFoundException">A required WordNet file does not exist: [filename]</exception>
public WordNetMemoryProvider(string dataPath)
{
    if (string.IsNullOrEmpty(dataPath))
    {
        throw new ArgumentNullException("dataPath");
    }
    var dir = new DirectoryInfo(dataPath);
    if (!dir.Exists)
    {
        throw new DirectoryNotFoundException("The data directory does not exist.");
    }

    // all eight database files (four data + four index) must be present
    var dataPaths = new []
    {
        new FileInfo(Path.Combine(dataPath, "data.adj")),
        new FileInfo(Path.Combine(dataPath, "data.adv")),
        new FileInfo(Path.Combine(dataPath, "data.noun")),
        new FileInfo(Path.Combine(dataPath, "data.verb"))
    };
    var indexPaths = new []
    {
        new FileInfo(Path.Combine(dataPath, "index.adj")),
        new FileInfo(Path.Combine(dataPath, "index.adv")),
        new FileInfo(Path.Combine(dataPath, "index.noun")),
        new FileInfo(Path.Combine(dataPath, "index.verb"))
    };
    foreach (var file in dataPaths.Union(indexPaths).Where(file => !file.Exists))
    {
        throw new FileNotFoundException("A required WordNet file does not exist: " + file.Name);
    }

    // Pass 1: Get total number of synsets
    var totalSynsets = 0;
    foreach (var dataInfo in dataPaths)
    {
        // scan synset data file for lines that don't start with a space...
        // these are synset definition lines
        using (var dataFile = new StreamReader(dataInfo.FullName))
        {
            string line;
            while ((line = dataFile.ReadLine()) != null)
            {
                var firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    ++totalSynsets;
                }
            }
        }
    }

    // Pass 2: Create synset shells (pos and offset only), presized from pass 1
    idSynset = new Dictionary <string, SynSet>(totalSynsets);
    foreach (var dataInfo in dataPaths)
    {
        var pos = WordNetFileProvider.GetFilePos(dataInfo.FullName);
        // scan synset data file
        using (var dataFile = new StreamReader(dataInfo.FullName))
        {
            string line;
            while ((line = dataFile.ReadLine()) != null)
            {
                var firstSpace = line.IndexOf(' ');
                if (firstSpace <= 0)
                {
                    continue;
                }
                // get offset and create synset shell
                var offset = int.Parse(line.Substring(0, firstSpace));
                var synset = new SynSet(pos, offset, null);
                idSynset.Add(synset.Id, synset);
            }
        }
    }

    // Pass 3: Instantiate synsets (hooks up relations, set glosses, etc.)
    foreach (var dataInfo in dataPaths)
    {
        var pos = WordNetFileProvider.GetFilePos(dataInfo.FullName);
        // scan synset data file
        using (var dataFile = new StreamReader(dataInfo.FullName))
        {
            string line;
            while ((line = dataFile.ReadLine()) != null)
            {
                var firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // instantiate synset defined on current line, using the instantiated synsets for all references
                    idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, idSynset);
                }
            }
        }
    }

    // organize synsets by pos and words...
    // also set most common synset for word-pos pairs that have multiple synsets
    posWordSynSets = new Dictionary <WordNetPos, Dictionary <string, List <SynSet> > >();
    foreach (var indexInfo in indexPaths)
    {
        var pos = WordNetFileProvider.GetFilePos(indexInfo.FullName);
        posWordSynSets.EnsureContainsKey(pos, typeof(Dictionary <string, List <SynSet> >));
        // scan word index file, skipping header lines
        using (var indexFile = new StreamReader(indexInfo.FullName))
        {
            string line;
            while ((line = indexFile.ReadLine()) != null)
            {
                var firstSpace = line.IndexOf(' ');
                if (firstSpace <= 0)
                {
                    continue;
                }
                // grab word and synset shells, along with the most common synset
                var word = line.Substring(0, firstSpace);
                SynSet mostCommonSynSet;
                var synsets = WordNetFileProvider.GetSynSetShells(line, pos, out mostCommonSynSet, wordNet);
                // set flag on most common synset if it's ambiguous
                if (synsets.Count > 1)
                {
                    idSynset[mostCommonSynSet.Id].SetAsMostCommonSynsetFor(word);
                }
                // use reference to the synsets that we instantiated in our three-pass routine above
                posWordSynSets[pos].Add(word, new List <SynSet>(synsets.Count));
                foreach (var synset in synsets)
                {
                    posWordSynSets[pos][word].Add(idSynset[synset.Id]);
                }
            }
        }
    }
}
/*--------------------------------------------------------------------------------------------*/
/// <summary>
/// Persists the relations of one synset: a <c>Lexical</c> row for every word-to-word
/// (lexically related) pair, and a <c>Semantic</c> row for every synset-to-synset relation.
/// Target entities are resolved through the <c>SynsetCache</c>/<c>WordCache</c> lookups.
/// </summary>
private static void InsertLexAndSemForSynSet(ISession pSess, string pSynSetId, SynSet pSynSet) {
    Synset dbSynSet = SynsetCache[pSynSetId];

    // Lexical (word-level) relations.
    foreach ( LexicalRelation lexRel in pSynSet.GetLexicallyRelated() ) {
        var dbLex = new Lexical { Synset = dbSynSet };
        // Cache keys are "<synset ssid>|<word>"; lookups are kept in the original order so a
        // missing cache entry surfaces at the same point as before.
        dbLex.Word = WordCache[dbLex.Synset.SsId + "|" + lexRel.FromWord];
        dbLex.RelationId = (byte)lexRel.Relation;
        dbLex.TargetSynset = SynsetCache[lexRel.ToSyn.ID];
        dbLex.TargetWord = WordCache[dbLex.TargetSynset.SsId + "|" + lexRel.ToWord];
        pSess.Save(dbLex);
    }

    // Semantic (synset-level) relations, one row per related synset per relation type.
    foreach ( WordNetEngine.SynSetRelation relType in pSynSet.SemanticRelations ) {
        Set <SynSet> related = pSynSet.GetRelatedSynSets(relType, false);
        foreach ( SynSet relSyn in related ) {
            var dbSem = new Semantic {
                Synset = dbSynSet,
                RelationId = (byte)relType,
                TargetSynset = SynsetCache[relSyn.ID]
            };
            pSess.Save(dbSem);
        }
    }
}
/// <summary>
/// Builds the ontology-concept -> WordNet-sense mapping ("wordology") for every concept in
/// AllConcepts.txt and serializes the resulting entries to wordology.txt.
/// Concepts with an ENGLISH1 map-lex property are disambiguated via
/// <see cref="WordSenseDisambiguator"/>; concepts without it are looked up directly in WordNet
/// under all four parts of speech. Summary statistics are shown in message boxes at the end.
/// Fixes vs. the previous revision: each entry now gets its own WordOlogy instance (the old
/// shared instance made every serialized entry alias the last-written state); the real word is
/// no longer clobbered by an always-empty local; the adverb branch now queries adverb (not noun)
/// relatedness; both files are closed even on exceptions; IDs are assigned consistently from 0.
/// </summary>
public void ConstructMapping() {
    int id = -1;                        // running id: each entry gets ++id, so ids start at 0
    ArrayList wordologyArr = new ArrayList();
    int conceptCounter = 0;             // concepts that yielded at least one mapping attempt
    int noMapLexConcepts = 0;           // concepts lacking an ENGLISH1 property
    int cannotGetSenseExceptions = 0;   // disambiguated senses that failed WordNet resolution
    int allSensesMapped = 0;

    LoadOntology();

    // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data; kept because the
    // on-disk wordology format depends on it. Consider migrating to a safer serializer.
    BinaryFormatter bf = new BinaryFormatter();
    string wordologyDirectoryPath = @"..\..\..\wordology\";

    // using-blocks guarantee both files are closed even if mapping throws
    // (previously the streams leaked on any exception).
    using (StreamReader allConceptsReader = new StreamReader(
               new FileStream(_ontologyDirectoryPath + @"\AllConcepts.txt", FileMode.Open)))
    using (FileStream fs = new FileStream(wordologyDirectoryPath + "\\wordology.txt", FileMode.Create)) {
        string concept;
        while ((concept = allConceptsReader.ReadLine()) != null) {
            Concept C = (Concept)Onto[concept];
            Property maplexProperty = C.FullProperties["ENGLISH1"];

            if (maplexProperty != null) {
                // fillerIndexes keeps each parsed sense aligned with its source filler, so
                // SenseNo is read from the right filler even after dedup/skips.
                var fillerIndexes = new List<int>();
                List<MyWordInfo> maplexSenses = ParseMaplexSenses(maplexProperty, fillerIndexes);

                if (maplexSenses.Count > 0) {
                    WordSenseDisambiguator wsd = new WordSenseDisambiguator();
                    MyWordInfo[] res = wsd.Disambiguate(maplexSenses.ToArray());

                    int sensesSucceeded = 0;
                    for (int i = 0; i < res.Length; i++) {
                        MyWordInfo wi = res[i];

                        // One fresh entry per sense — never reuse a single instance.
                        WordOlogy entry = new WordOlogy();

                        string rawFiller = maplexProperty.Fillers[fillerIndexes[i]].ScalarFiller;
                        string[] fillerParts = rawFiller.Split(new[] { '-', '_' });
                        if (fillerParts.Length > 1 && fillerParts[fillerParts.Length - 1].Length == 2) {
                            entry.SenseNo = fillerParts[fillerParts.Length - 1];
                        }

                        Wnlib.PartOfSpeech p = Wnlib.PartOfSpeech.of((Wnlib.PartsOfSpeech)wi.Pos);
                        try {
                            Wnlib.Index index = Wnlib.Index.lookup(wi.Word.ToLower(), p);
                            SynSet sense = new SynSet(index, wi.Sense, null);
                            entry.Sense = sense.defn;
                            allSensesMapped++;
                            sensesSucceeded++;
                            try { entry.Pos = p.name; }
                            catch { entry.Pos = wi.Pos.ToString(); }
                            entry.ID = ++id;
                            entry.Word = wi.Word; // fix: was overwritten with an empty local
                            entry.Concept = concept;
                        }
                        catch {
                            // Best-effort: a word WordNet cannot resolve is still recorded,
                            // just without a sense definition (matches prior behavior).
                        }

                        if (sensesSucceeded == 0) {
                            cannotGetSenseExceptions++;
                        }
                        wordologyArr.Add(entry);
                    }
                    conceptCounter++;
                }
            } else {
                noMapLexConcepts++;

                // No map-lex info: try a direct WordNet lookup under every part of speech.
                int sensesSucceeded = 0;
                sensesSucceeded += AddSensesForPos(concept, PartsOfSpeech.Noun, "noun", wordologyArr, ref id, ref allSensesMapped);
                sensesSucceeded += AddSensesForPos(concept, PartsOfSpeech.Verb, "verb", wordologyArr, ref id, ref allSensesMapped);
                sensesSucceeded += AddSensesForPos(concept, PartsOfSpeech.Adj, "adj", wordologyArr, ref id, ref allSensesMapped);
                // fix: previously queried Noun relatedness for the adverb branch
                sensesSucceeded += AddSensesForPos(concept, PartsOfSpeech.Adv, "adv", wordologyArr, ref id, ref allSensesMapped);

                if (sensesSucceeded != 0) {
                    conceptCounter++;
                }
            }
        }

        bf.Serialize(fs, wordologyArr);
    }

    MessageBox.Show("no map-lex concepts number = " + noMapLexConcepts.ToString());
    MessageBox.Show("can't getsense pos number = " + cannotGetSenseExceptions.ToString());
    MessageBox.Show(conceptCounter.ToString());
}

/// <summary>
/// Parses the ENGLISH1 fillers ("some-word-pos" / "some_word_pos") into MyWordInfo entries,
/// skipping untyped fillers and collapsing consecutive duplicates (same word and POS).
/// <paramref name="fillerIndexes"/> receives, for each returned sense, the index of the filler
/// it came from, keeping callers aligned with <c>maplexProperty.Fillers</c>.
/// </summary>
private static List<MyWordInfo> ParseMaplexSenses(Property maplexProperty, List<int> fillerIndexes) {
    var senses = new List<MyWordInfo>();
    for (int i = 0; i < maplexProperty.Fillers.Count; i++) {
        string[] parts = maplexProperty.Fillers[i].ScalarFiller.Split(new[] { '-', '_' });
        if (parts.Length <= 1) {
            continue; // fillers with no type suffix (e.g. "a-bomb" style entries) are skipped
        }

        // The word is everything except the final token (joined with spaces).
        MyWordInfo mwi = new MyWordInfo();
        for (int k = 0; k < parts.Length - 2; k++) {
            mwi.Word += parts[k] + " ";
        }
        mwi.Word += parts[parts.Length - 2];

        string posToken = parts[parts.Length - 1];
        if (posToken.Length == 2) {
            // Two-character suffix encodes the part of speech by its first letter.
            switch (posToken[0]) {
                case 'v': mwi.Pos = Wnlib.PartsOfSpeech.Verb; break;
                case 'n': mwi.Pos = Wnlib.PartsOfSpeech.Noun; break;
                case 'a': mwi.Pos = Wnlib.PartsOfSpeech.Adj; break;
                case 'r': mwi.Pos = Wnlib.PartsOfSpeech.Adv; break;
                default: mwi.Pos = Wnlib.PartsOfSpeech.Unknown; break;
            }
        } else {
            // Final token is part of the word itself, not a POS tag.
            mwi.Pos = Wnlib.PartsOfSpeech.Unknown;
            mwi.Word += " " + posToken;
        }

        // Collapse consecutive duplicates. (Also fixes the old condition, which silently
        // dropped a valid sense whenever the very first filler had been skipped.)
        if (senses.Count == 0
            || mwi.Word != senses[senses.Count - 1].Word
            || mwi.Pos != senses[senses.Count - 1].Pos) {
            senses.Add(mwi);
            fillerIndexes.Add(i);
        }
    }
    return senses;
}

/// <summary>
/// Looks up <paramref name="concept"/> in WordNet under one part of speech and appends one
/// WordOlogy entry per (relatedness option, sense number). Lookup failures are swallowed
/// (best-effort, matching the original behavior). Returns the number of entries added.
/// </summary>
private int AddSensesForPos(string concept, PartsOfSpeech pos, string posName,
                            ArrayList results, ref int id, ref int allSensesMapped) {
    int succeeded = 0;
    try {
        Wnlib.Index index = Wnlib.Index.lookup(concept.ToLower(), PartOfSpeech.of(pos));
        if (index != null) {
            Opt[] relatedness = WordsMatching.Relatedness.GetRelatedness(pos);
            foreach (Opt o in relatedness) {
                for (int senseNumber = 0; senseNumber < index.sense_cnt; senseNumber++) {
                    Search se = new Search(concept, true, PartOfSpeech.of(posName), o.sch, senseNumber);
                    SynSet sense = new SynSet(index, senseNumber, se);

                    WordOlogy entry = new WordOlogy();
                    entry.Pos = posName;
                    entry.Concept = concept;
                    entry.Word = concept;
                    entry.Sense = sense.defn;
                    entry.ID = ++id; // consistent with the map-lex branch (ids start at 0)
                    succeeded++;
                    allSensesMapped++;
                    results.Add(entry);
                }
            }
        }
    }
    catch {
        // Best-effort: a failed lookup/search for this POS is skipped entirely.
    }
    return succeeded;
}
/// <summary>
/// Initializes a new instance of the <see cref="WordNetMemoryProvider"/> class. The whole
/// WordNet database under <paramref name="dataPath"/> is read into memory: synsets are counted,
/// created as shells, fully instantiated, and finally grouped by part of speech and word.
/// </summary>
/// <param name="dataPath">The data path.</param>
/// <exception cref="System.ArgumentNullException">dataPath</exception>
/// <exception cref="System.IO.DirectoryNotFoundException">The data directory does not exist.</exception>
/// <exception cref="System.IO.FileNotFoundException">A required WordNet file does not exist: [filename]</exception>
public WordNetMemoryProvider(string dataPath)
{
    if (string.IsNullOrEmpty(dataPath))
    {
        throw new ArgumentNullException("dataPath");
    }

    var dataDirectory = new DirectoryInfo(dataPath);
    if (!dataDirectory.Exists)
    {
        throw new DirectoryNotFoundException("The data directory does not exist.");
    }

    // One data/index file pair per part of speech.
    var dataFiles = new[]
    {
        new FileInfo(Path.Combine(dataPath, "data.adj")),
        new FileInfo(Path.Combine(dataPath, "data.adv")),
        new FileInfo(Path.Combine(dataPath, "data.noun")),
        new FileInfo(Path.Combine(dataPath, "data.verb"))
    };
    var indexFiles = new[]
    {
        new FileInfo(Path.Combine(dataPath, "index.adj")),
        new FileInfo(Path.Combine(dataPath, "index.adv")),
        new FileInfo(Path.Combine(dataPath, "index.noun")),
        new FileInfo(Path.Combine(dataPath, "index.verb"))
    };

    // Verify every required file exists before doing any work.
    foreach (var missing in dataFiles.Union(indexFiles).Where(f => !f.Exists))
    {
        throw new FileNotFoundException("A required WordNet file does not exist: " + missing.Name);
    }

    // Pass 1: count synset definition lines (those whose first character is not a space)
    // so the id->synset dictionary can be presized.
    var synsetCount = 0;
    foreach (var dataFileInfo in dataFiles)
    {
        using (var reader = new StreamReader(dataFileInfo.FullName))
        {
            string currentLine;
            while ((currentLine = reader.ReadLine()) != null)
            {
                if (currentLine.IndexOf(' ') > 0)
                {
                    ++synsetCount;
                }
            }
        }
    }

    // Pass 2: create a shell (POS + offset only) for each synset.
    idSynset = new Dictionary<string, SynSet>(synsetCount);
    foreach (var dataFileInfo in dataFiles)
    {
        var filePos = WordNetFileProvider.GetFilePos(dataFileInfo.FullName);
        using (var reader = new StreamReader(dataFileInfo.FullName))
        {
            string currentLine;
            while ((currentLine = reader.ReadLine()) != null)
            {
                var spaceIndex = currentLine.IndexOf(' ');
                if (spaceIndex <= 0)
                {
                    continue; // not a synset definition line
                }

                var synsetOffset = int.Parse(currentLine.Substring(0, spaceIndex));
                var shell = new SynSet(filePos, synsetOffset, null);
                idSynset.Add(shell.Id, shell);
            }
        }
    }

    // Pass 3: fully instantiate every shell, wiring relations against the shells from pass 2.
    foreach (var dataFileInfo in dataFiles)
    {
        var filePos = WordNetFileProvider.GetFilePos(dataFileInfo.FullName);
        using (var reader = new StreamReader(dataFileInfo.FullName))
        {
            string currentLine;
            while ((currentLine = reader.ReadLine()) != null)
            {
                var spaceIndex = currentLine.IndexOf(' ');
                if (spaceIndex > 0)
                {
                    idSynset[filePos + ":" + int.Parse(currentLine.Substring(0, spaceIndex))].Instantiate(currentLine, idSynset);
                }
            }
        }
    }

    // Group synsets by POS and word; mark the most common synset for ambiguous word-POS pairs.
    posWordSynSets = new Dictionary<WordNetPos, Dictionary<string, List<SynSet>>>();
    foreach (var indexFileInfo in indexFiles)
    {
        var filePos = WordNetFileProvider.GetFilePos(indexFileInfo.FullName);
        posWordSynSets.EnsureContainsKey(filePos, typeof(Dictionary<string, List<SynSet>>));

        using (var reader = new StreamReader(indexFileInfo.FullName))
        {
            string currentLine;
            while ((currentLine = reader.ReadLine()) != null)
            {
                var spaceIndex = currentLine.IndexOf(' ');
                if (spaceIndex <= 0)
                {
                    continue; // header line
                }

                // Word plus its synset shells, along with the most common synset.
                var word = currentLine.Substring(0, spaceIndex);
                SynSet mostCommonSynSet;
                var shells = WordNetFileProvider.GetSynSetShells(currentLine, filePos, out mostCommonSynSet, wordNet);

                // The most-common flag only matters when the word is ambiguous.
                if (shells.Count > 1)
                {
                    idSynset[mostCommonSynSet.Id].SetAsMostCommonSynsetFor(word);
                }

                // Store references to the synsets instantiated in the passes above, not the shells.
                posWordSynSets[filePos].Add(word, new List<SynSet>(shells.Count));
                foreach (var shell in shells)
                {
                    posWordSynSets[filePos][word].Add(idSynset[shell.Id]);
                }
            }
        }
    }
}
/// <summary>
/// Double-clicking the second synset label clears its current selection: the stored synset is
/// forgotten, the label text is restored, and similarity computation is disabled until a new
/// synset is assigned.
/// </summary>
private void ss2_DoubleClick(object sender, EventArgs e)
{
    _semSimSs2 = null;               // forget the previously chosen synset
    ss2.Text = _origSsLbl;           // restore the placeholder label text
    computeSemSim.Enabled = false;   // nothing to compare until a new synset is picked
}
/// <summary>
/// Computes the semantic similarity between two synsets. The specific measure — and hence the
/// range and interpretation of the returned score — is defined by the implementing subclass.
/// </summary>
/// <param name="synSet1">The first synset to compare.</param>
/// <param name="synSet2">The second synset to compare.</param>
/// <returns>The similarity score; range depends on the implementing measure.</returns>
public abstract double ComputeSimilarity(SynSet synSet1, SynSet synSet2);