/// <summary>
/// Collects synonyms for the model's lemma via the Serbian WordNet: first maps the word
/// to symset codes, then expands those codes back into words.
/// </summary>
/// <param name="model">Term exploration model whose lemma input form is queried.</param>
/// <param name="response">Optional log builder; when not null it receives a tree view after each query.</param>
/// <returns>The same model, with wordnetSecondarySymsets, synonyms and graph populated.</returns>
public static termExploreModel getSynonymsWithSerbianWordNet(termExploreModel model, ILogBuilder response)
{
    tokenGraph graph = new tokenGraph(model.lemma.inputForm);

    // Shared logging routine: dump the current graph as a tree view, with color toggled.
    Action dumpGraph = () =>
    {
        if (response == null) return;
        response.consoleAltColorToggle();
        response.Append(graph.ToStringTreeview());
        response.consoleAltColorToggle();
    };

    // Step 1: word -> symset codes
    languageManagerWordnet.manager.queryWithGraph(graph, response, WordnetSource.serbian, WordnetQueryType.getSymsetCodesByWord);
    model.wordnetSecondarySymsets.AddRange(graph.getAllLeafs().getNames());
    dumpGraph();

    // Step 2: symset codes -> words (the synonyms)
    languageManagerWordnet.manager.queryWithGraph(graph, response, WordnetSource.serbian, WordnetQueryType.getWordsBySymsetCode);
    model.synonyms.AddRange(graph.getAllLeafs().getNames());
    dumpGraph();

    model.graph = graph;
    return model;
}
/// <summary>
/// Retrieves the lexicInflection information for <paramref name="key"/>; when found,
/// lazily converts each grammar case's definition line into gram tags.
/// </summary>
/// <param name="key">The lexicon key to resolve.</param>
/// <param name="logger">Optional logger; unresolved keys are reported here when settings allow it.</param>
/// <returns>The resolved inflection, or null when the key is unknown.</returns>
protected lexicInflection resolve(String key, ILogBuilder logger = null)
{
    var output = GetLexicUnitBase(key, logger);

    if (output == null)
    {
        // Optionally record the miss so unresolved tokens can be reviewed later.
        if (settings.doLogUnresolvedTokens && logger != null)
        {
            logger.Append("![" + key + "] ");
        }
        return output;
    }

    foreach (lexicGrammarCase grammarCase in output)
    {
        // tags are parsed on demand, once per grammar case
        if (grammarCase.tags == null)
        {
            grammarCase.tags = grammTagConverter.ConvertFromString(grammarCase.lexicalDefinitionLine);
        }
    }

    return output;
}
/// <summary>
/// Builds a <see cref="DataTable"/> with statistics for every lemma in the lexicon:
/// relationship counts, instance counts and associated concept names.
/// </summary>
/// <param name="log">Progress logger; may be null.</param>
/// <returns>Populated table, with aggregate totals attached as extra lines.</returns>
public DataTable getLemmaStats(ILogBuilder log)
{
    DataTable output = new DataTable("LexiconLemmas");
    output.SetDescription("List of all lemma definition contained in the lexicon");

    var dc_lemma = output.Columns.Add("Lemma");
    var dc_type = output.Columns.Add("Type");
    var dc_relTo = output.Columns.Add("RelatedTo").SetValueType(typeof(int));
    var dc_relFrom = output.Columns.Add("RelatedFrom").SetValueType(typeof(int));
    var dc_instances = output.Columns.Add("Instances").SetValueType(typeof(int));
    var dc_concepts = output.Columns.Add("Concepts").SetValueType(typeof(int));
    var dc_conn = output.Columns.Add("Concept_names");
    var dc_reln = output.Columns.Add("Relationships");
    var dc_relni = output.Columns.Add("Relationships_and_instances");

    int i = 0;
    int index = 0;
    int end = lexiconContext.TermLemmas.Count();
    // FIX: was end / 20 -- for a lexicon with fewer than 20 lemmas step became 0,
    // causing a progress entry on every single row.
    int step = Math.Max(1, end / 20);

    foreach (ITermLemma lemma in lexiconContext.TermLemmas)
    {
        var dr = output.NewRow();
        dr[0] = lemma.name;
        dr[1] = lemma.type;
        dr[2] = lemma.relatedTo.Count;
        dr[3] = lemma.relatedFrom.Count;
        dr[4] = lemma.instances.Count;
        dr[5] = lemma.concepts.Count;

        string connames = "";
        foreach (IConcept c in lemma.concepts)
        {
            connames = connames.add(c.name, ",");
        }
        dr[6] = connames;

        int rel = lemma.relatedTo.Count + lemma.relatedFrom.Count + lemma.concepts.Count;
        dr[7] = rel;
        dr[8] = rel + lemma.instances.Count;

        output.Rows.Add(dr);
        i++;
        index++;

        if (index > step)
        {
            index = 0;
            double r = ((double)i) / ((double)end);
            // FIX: log was dereferenced without a null check, unlike the rest of this API surface
            if (log != null)
            {
                log.Append(" [" + r.ToString("P") + "] ");
            }
        }
    }

    output.AddExtra("Total lemmas: " + end);
    int noConRel = output.Select(dc_concepts.ColumnName + " = 0").count();
    int noTotRel = output.Select(dc_reln.ColumnName + " = 0").count();
    output.AddExtra("Without concept relationship: " + noConRel + " (" + noConRel.imbGetPercentage(end) + ")");
    output.AddExtra("Without any relationship: " + noTotRel + " (" + noTotRel.imbGetPercentage(end) + ")");
    return output;
}
/// <summary>
/// Renders the specified set of WebSiteDocuments into List of <see cref="TextDocumentSet"/>s
/// </summary>
/// <param name="input">The input web-site document set.</param>
/// <param name="logger">The logger; may be null.</param>
/// <returns>One <see cref="TextDocumentSet"/> per web site in <paramref name="input"/>.</returns>
public List<TextDocumentSet> RenderDocumentSet(WebSiteDocumentsSet input, ILogBuilder logger)
{
    List<TextDocumentSet> textSetForLabel = new List<TextDocumentSet>();
    Int32 target = input.Count;
    Int32 ti = 0;

    foreach (WebSiteDocuments webSite in input)
    {
        // Documents are always grouped per site; the old per-page branch was dead commented-out code.
        TextDocumentSet textSet = RenderSiteDocuments(webSite, logger);
        textSetForLabel.Add(textSet);

        ti++;
        Double done = ti.GetRatio(target);
        // FIX: logger was dereferenced without a null check
        if (logger != null)
        {
            logger.Append(" [" + done.ToString("P2") + "] ");
        }
    }

    return textSetForLabel;
}
/// <summary>
/// Transforms to fv dictionary: each document receives one feature vector whose dimensions are
/// its similarity (per <paramref name="function"/>) to every category/label model in the space.
/// </summary>
/// <param name="context">The document-selection context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <param name="log">Progress log.</param>
/// <returns>Feature vector sets, grouped by domain ID.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Category Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // Precompute one weight dictionary per label (its category model).
    Dictionary<String, WeightDictionary> categoryDictionarties = new Dictionary<string, WeightDictionary>();
    foreach (SpaceLabel label in context.spaceModel.labels)
    {
        Relationship<SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();
        var c = TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label);
        categoryDictionarties.Add(label.name, c);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();
    Int32 i = 0;
    // FIX: was context.Count / 20 -- "i % p" below threw DivideByZeroException
    // whenever the context held fewer than 20 items.
    Int32 p = Math.Max(1, context.Count / 20);

    foreach (var entry in context.items)
    {
        i++;
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

        FeatureVector fv = new FeatureVector(entry.AssignedID);
        fv.dimensions = new double[context.spaceModel.labels.Count];

        // One dimension per label: document-to-category similarity.
        Parallel.ForEach(context.spaceModel.labels, (label) =>
        {
            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[label.name], documentWeights);
            fv.dimensions[context.spaceModel.labels.IndexOf(label)] = docToClassSimilarity;
        });

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }

        dict.GetOrAdd(entry.DomainID).Add(fv, -1);
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation done...");
    return dict;
}
/// <summary>
/// Collects synonym candidates for the model's lemma by searching the corpus source file
/// and adding the matched line contents as Serbian word nodes into a token graph.
/// </summary>
/// <param name="model">Term exploration model whose lemma input form is searched.</param>
/// <param name="response">Optional log builder; receives a tree view of the graph when not null.</param>
/// <returns>The same model, with synonyms and graph populated.</returns>
public static termExploreModel getSynonymsByCorpus(termExploreModel model, ILogBuilder response)
{
    tokenGraph graph = new tokenGraph(model.lemma.inputForm);

    var corpusOperator = semanticLexiconManager.manager.settings.sourceFiles.getOperater(lexiconSourceTypeEnum.corpus);
    var lines = corpusOperator.Search(model.lemma.inputForm);
    graph.Add(lines.getLineContentList(), tokenGraphNodeType.word_srb);

    model.synonyms.AddRange(graph.getAllLeafs().getNames());

    if (response != null)
    {
        response.consoleAltColorToggle();
        response.Append(graph.ToStringTreeview());
        response.consoleAltColorToggle();
    }

    model.graph = graph;
    return model;
}
/// <summary>
/// Extends <see cref="getSynonymsWithSerbianWordNet"/> by running Apertium over the resulting
/// graph in both directions: native-side lookup for translations, then translated-side lookup
/// to harvest synonyms.
/// </summary>
/// <param name="model">Term exploration model to enrich.</param>
/// <param name="response">Optional log builder; receives a tree view of the graph when not null.</param>
/// <returns>The same model, with translations, synonyms and graph populated.</returns>
public static termExploreModel getSynonymsWithSerbianWordNetAndApertium(termExploreModel model, ILogBuilder response)
{
    model = getSynonymsWithSerbianWordNet(model, response);
    tokenGraph graph = model.graph;

    // native -> translated: collect translations
    languageManagerApertium.manager.queryByGraphNode(model.graph, apertiumDictQueryScope.exact, apertiumDictNeedleSide.translated | apertiumDictNeedleSide.native & apertiumDictNeedleSide.native);
    model.translations.AddRange(graph.getAllLeafs().getNames());

    // translated -> native: expand back into synonym candidates
    languageManagerApertium.manager.queryByGraphNode(graph, apertiumDictQueryScope.exact, apertiumDictNeedleSide.translated);

    string treeView = graph.ToStringTreeview();
    if (response != null)
    {
        response.Append(treeView);
    }

    model.synonyms.AddRange(graph.getAllLeafs().getNames());
    model.graph = graph;
    return model;
}
/// <summary>
/// Queries Apertium for the model's lemma; when translations exist, expands them back
/// (translated side) to collect synonym candidates. Marks the model as failed when the
/// initial query returns an empty graph.
/// </summary>
/// <param name="model">Term exploration model whose lemma input form is queried.</param>
/// <param name="response">Optional log builder; receives a tree view of the graph when not null.</param>
/// <returns>The same model, with translations, synonyms, wasExploreFailed and graph updated.</returns>
public static termExploreModel getSynonymsWithApertium(termExploreModel model, ILogBuilder response)
{
    tokenGraph graph = languageManagerApertium.manager.queryForGraph(model.lemma.inputForm, apertiumDictQueryScope.exact);

    if (graph.Count() == 0)
    {
        // nothing found for this lemma -- flag for fallback strategies
        model.wasExploreFailed = true;
    }
    else
    {
        model.translations.AddRange(graph.getAllLeafs().getNames());

        // expand translated side back to collect synonyms
        languageManagerApertium.manager.queryByGraphNode(graph, apertiumDictQueryScope.exact, apertiumDictNeedleSide.translated);

        string treeView = graph.ToStringTreeview();
        if (response != null)
        {
            response.Append(treeView);
        }

        model.synonyms.AddRange(graph.getAllLeafs().getNames());
    }

    model.graph = graph;
    return model;
}
/// <summary>
/// Builds a <see cref="DataTable"/> with statistics for every concept in the lexicon:
/// relationship counts, hyper/hypo concept information and associated lemma names.
/// </summary>
/// <param name="log">Progress logger; may be null.</param>
/// <returns>Populated table, with aggregate totals attached as extra lines.</returns>
public DataTable getConceptStats(ILogBuilder log)
{
    DataTable output = new DataTable("LexiconConcepts");
    output.SetDescription("List of all concept definition contained in the lexicon");

    var dc_lemma = output.Columns.Add("Concept");
    var dc_desc = output.Columns.Add("Description");
    var dc_relTo = output.Columns.Add("RelatedTo").SetValueType(typeof(int));
    var dc_relFrom = output.Columns.Add("RelatedFrom").SetValueType(typeof(int));
    var dc_hyper = output.Columns.Add("Hyper");
    var dc_hypo = output.Columns.Add("Hypo").SetValueType(typeof(int));
    var dc_lemmas = output.Columns.Add("Lemmas").SetValueType(typeof(int));
    output.Columns.Add("Lemmas names");
    var dc_conn = output.Columns.Add("Concept_relationships").SetValueType(typeof(int));
    var dc_reln = output.Columns.Add("Total_relationships").SetValueType(typeof(int));

    int i = 0;
    int index = 0;
    int end = lexiconContext.Concepts.Count();
    // FIX: was end / 20 -- with fewer than 20 concepts step became 0, producing
    // a progress entry on every single row.
    int step = Math.Max(1, end / 20);

    foreach (IConcept lemma in lexiconContext.Concepts)
    {
        var dr = output.NewRow();
        dr[0] = lemma.name;
        dr[1] = lemma.description;
        dr[2] = lemma.relatedTo.Count;
        dr[3] = lemma.relatedFrom.Count;

        int rel = 0;
        if (lemma.hyperConcept != null)
        {
            rel++; // the hyper-concept link counts as one relationship
            dr[4] = lemma.hyperConcept.name;
        }
        else
        {
            dr[4] = "[none]";
        }
        dr[5] = lemma.hypoConcepts.Count;
        dr[6] = lemma.lemmas.Count;

        string connames = "";
        foreach (ITermLemma c in lemma.lemmas)
        {
            connames = connames.add(c.name, ",");
        }
        dr[7] = connames;

        rel += lemma.relatedTo.Count + lemma.relatedFrom.Count + lemma.lemmas.Count;
        dr[8] = rel;
        // NOTE(review): lemma count is added a second time here -- confirm that
        // Total_relationships is really meant to double-weight lemma links.
        rel += lemma.lemmas.Count;
        dr[9] = rel;

        output.Rows.Add(dr);
        i++;
        index++;

        if (index > step)
        {
            double r = ((double)i) / ((double)end);
            // FIX: log was dereferenced without a null check, unlike the rest of this API surface
            if (log != null)
            {
                log.Append(" [" + r.ToString("P") + "] ");
            }
            index = 0;
        }
    }

    output.AddExtra("Total concept: " + end);
    int noConRel = output.Select(dc_conn.ColumnName + " = 0").count();
    int noTotRel = output.Select(dc_reln.ColumnName + " = 0").count();
    output.AddExtra("Without concept relationship: " + noConRel + " (" + noConRel.imbGetPercentage(end) + ")");
    output.AddExtra("Without any relationship: " + noTotRel + " (" + noTotRel.imbGetPercentage(end) + ")");
    return output;
}
/// <summary>
/// Attempts to discover grammatical tags for <paramref name="item"/> by voting over the
/// features returned by <see cref="termDiscoveryResolver"/> for its input form.
/// </summary>
/// <param name="item">The term instance to resolve; its gramSet is extended on success.</param>
/// <param name="loger">Optional log for diagnostic output of votes and the final gram.</param>
/// <param name="debug">Unused; kept for interface compatibility.</param>
/// <returns>True when resolution failed (no votes and input form length >= 4); false otherwise.</returns>
public bool discoverGram(termExploreItem item, ILogBuilder loger, bool debug = true)
{
    bool failed = false;

    if (loger != null)
    {
        loger.AppendLine("Item:" + item.inputForm);
    }

    instanceCountCollection<object> res = termDiscoveryResolver.resolveQuery(item.inputForm);
    res.reCalculate();

    if (res.Count > 0)
    {
        List<object> sorted = res.getSorted();

        // When the item already carries a POS type, ignore POS votes from the resolver.
        if (item.gramSet.getPosType() != pos_type.none)
        {
            sorted.RemoveAll(x => x is pos_type);
        }

        gramFlags gf = new gramFlags();
        if (sorted.Any(x => x is pos_type))
        {
            gf.Set((pos_type)sorted.First(x => x is pos_type));
        }

        // Keep only the gram categories applicable to the detected POS type.
        var tl = posConverter.posTypeVsPattern[gf.type];
        sorted.RemoveAll(x => !tl.Contains(x.GetType()));

        if (loger != null)
        {
            loger.AppendLine("Votes:");
            // FIX: was Math.Max(sorted.Count(), 20), which iterated past the end of the
            // list (IndexOutOfRangeException) whenever there were fewer than 20 votes.
            for (int i = 0; i < Math.Min(sorted.Count(), 20); i++)
            {
                loger.Append(sorted[i].ToString() + "; ");
            }
        }

        // Apply the highest-ranked vote of each gram category, when present.
        if (sorted.Any(x => x is pos_gender)) { gf.Set((pos_gender)sorted.First(x => x is pos_gender)); }
        if (sorted.Any(x => x is pos_gramaticalCase)) { gf.Set((pos_gramaticalCase)sorted.First(x => x is pos_gramaticalCase)); }
        if (sorted.Any(x => x is pos_verbform)) { gf.Set((pos_verbform)sorted.First(x => x is pos_verbform)); }
        if (sorted.Any(x => x is pos_number)) { gf.Set((pos_number)sorted.First(x => x is pos_number)); }
        if (sorted.Any(x => x is pos_degree)) { gf.Set((pos_degree)sorted.First(x => x is pos_degree)); }
        if (sorted.Any(x => x is pos_person)) { gf.Set((pos_person)sorted.First(x => x is pos_person)); }

        if (loger != null)
        {
            loger.AppendLine("Final gram:" + gf.ToString());
        }

        item.gramSet.Add(gf);
    }
    else
    {
        // Very short forms are not treated as failures -- not enough signal to resolve.
        if (item.inputForm.Length < 4)
        {
            return false;
        }
        failed = true;
    }

    return failed;
}
/// <summary>
/// Method: word -- translation --- synset ---- other synsets --- collecting all words --- translation --- word
/// </summary>
/// <param name="model">The model.</param>
/// <param name="response">The response log; may be null.</param>
/// <param name="disableCodePrefixFilter">if set to <c>true</c>, symset codes are not filtered by the POS-type code prefix.</param>
/// <param name="disableCodeBranchFilter">if set to <c>true</c>, symset branches that lack the source word are kept.</param>
/// <returns>The model, with translations, symsets, synonyms and graph populated.</returns>
public static termExploreModel getSynonymsWithWordnetViaApertium(termExploreModel model, ILogBuilder response, bool disableCodePrefixFilter = false, bool disableCodeBranchFilter = false)
{
    // FIX: removed dead assignment (result = model.graph was immediately overwritten by the query below)
    tokenGraph result = languageManagerApertium.manager.queryForGraph(model.lemma.inputForm, apertiumDictQueryScope.exact);
    model.translations.AddRange(result.getAllLeafs().getNames());
    if (response != null)
    {
        response.consoleAltColorToggle();
        string rst = result.ToStringTreeview();
        response.Append(rst);
        response.consoleAltColorToggle();
    }

    // English word -> symset codes
    languageManagerWordnet.manager.queryWithGraph(result, response, WordnetSource.english, WordnetQueryType.getSymsetCodesByWord);
    if (response != null)
    {
        response.consoleAltColorToggle();
        string st = result.ToStringTreeview();
        response.Append(st);
        response.consoleAltColorToggle();
    }
    model.wordnetSecondarySymsets.AddRange(result.getAllLeafs().getDeepest().getNames());

    if (!disableCodePrefixFilter)
    {
        // keep only symset codes whose prefix matches the lemma's POS type
        string codeStart = model.lemma.gramSet.getPosType().GetWordNetCodeStart().ToString();
        Regex codeCriteria = new Regex("^" + codeStart); // FIX: dropped redundant + "" concatenation
        var badCodes = result.getAllLeafs(codeCriteria, true);
        if (response != null)
        {
            response.AppendHorizontalLine();
            response.AppendLine("Reducing to proper codes [" + codeStart + "]->filtered-out[" + badCodes.Count() + "]");
        }
        badCodes.removeFromParent();
        if (response != null)
        {
            response.consoleAltColorToggle();
            string rst = result.ToStringTreeview();
            response.Append(rst);
            response.consoleAltColorToggle();
        }
    }

    model.wordnetPrimarySymsets.AddRange(result.getAllLeafs().getDeepest().getNames());

    // symset codes -> English words
    languageManagerWordnet.manager.queryWithGraph(result, response, WordnetSource.english, WordnetQueryType.getWordsBySymsetCode);
    model.translationRelated.AddRange(result.getAllLeafs().getDeepest().getNames());
    if (response != null)
    {
        response.AppendHorizontalLine();
        response.AppendLine("Getting English words by symsetcodes via WordNet");
        response.consoleAltColorToggle();
        string rst = result.ToStringTreeview();
        response.Append(rst);
        response.consoleAltColorToggle();
    }

    // English words -> back to Serbian
    languageManagerApertium.manager.queryByGraphNode(result, apertiumDictQueryScope.exact, apertiumDictNeedleSide.translated);
    model.wordnetSynonyms.AddRange(result.getAllLeafs().getDeepest().getNames());
    if (response != null)
    {
        response.AppendHorizontalLine();
        response.AppendLine("Translating back to Serbian via Apertium");
        response.consoleAltColorToggle();
        string rst = result.ToStringTreeview();
        response.Append(rst);
        response.consoleAltColorToggle();
    }

    if (!disableCodeBranchFilter) // <------ removes the symset nodes that contain none of first-level translation words
    {
        var codeLevel = result.getAllChildren().getOnLevel(3);
        List<IObjectWithPathAndChildren> toTakeOut = new List<IObjectWithPathAndChildren>();
        foreach (var clb in codeLevel)
        {
            foreach (var clb_c in clb)
            {
                // a branch survives only if some grandchild matches the source word
                bool takeOut = true;
                foreach (var clb_cc in clb_c)
                {
                    if (clb_cc.name == model.lemma.inputForm)
                    {
                        takeOut = false;
                        break;
                    }
                }
                if (takeOut)
                {
                    if (response != null)
                    {
                        response.AppendLine("-- take out: " + clb.path);
                    }
                    toTakeOut.Add(clb);
                    break;
                }
            }
        }
        toTakeOut.removeFromParent();

        int wps = Enumerable.Count(model.wordnetSecondarySymsets);
        int tr = Enumerable.Count(model.translationRelated);
        int ws = Enumerable.Count(model.wordnetSynonyms);
        if (response != null)
        {
            response.AppendLine("----- Branch-node filter ----");
            response.AppendLine("Symsets: " + wps);
            response.AppendLine("Translations: " + tr);
            response.AppendLine("Terms: " + ws);
            response.consoleAltColorToggle();
            string rst = result.ToStringTreeview();
            response.Append(rst);
            response.consoleAltColorToggle();
        }

        // rebuild the model collections from the pruned graph, by level
        model.wordnetPrimarySymsets = result.getAllChildren().getOnLevel(3).getNames(true);
        model.translations = result.getAllChildren().getOnLevel(4).getNames(true);
        model.synonyms = result.getAllChildren().getOnLevel(5).getNames(true);

        wps = wps - Enumerable.Count(model.wordnetPrimarySymsets);
        tr = tr - Enumerable.Count(model.translations);
        ws = ws - Enumerable.Count(model.synonyms);
        if (response != null)
        {
            response.AppendLine("Reduction of Symsets: " + wps);
            response.AppendLine("Reduction of Translations: " + tr);
            response.AppendLine("Reduction of Terms: " + ws);
            response.consoleAltColorToggle();
            string rst = result.ToStringTreeview();
            response.Append(rst);
            response.consoleAltColorToggle();
        }
    }

    model.graph = result;
    return model;
}
/// <summary>
/// Loads all external plug-ins from the <see cref="folderNode"/> specified
/// </summary>
/// <param name="output">The log builder to output info to; may be null</param>
/// <param name="altFolder">Alternative folder with plugins to load from, at the end of the process it will set back to the existing one (if there was no existing folder, it will set this as default)</param>
public void loadPlugins(ILogBuilder output, folderNode altFolder = null)
{
    folderNode old = folderWithPlugins;
    if (altFolder != null)
    {
        folderWithPlugins = altFolder;
        if (output != null)
        {
            output.log("Loading from alternative directory: " + folderWithPlugins.path);
        }
    }

    dllFileNames.AddRange(folderWithPlugins.findFiles("*.dll", System.IO.SearchOption.AllDirectories));

    foreach (string dllFile in dllFileNames)
    {
        try
        {
            // FIX: GetAssemblyName can itself throw (BadImageFormatException for non-assembly
            // dlls, IOException on access problems) -- it must run inside the try, otherwise a
            // single bad file aborted the whole plugin scan.
            AssemblyName an = AssemblyName.GetAssemblyName(dllFile);
            Assembly assembly = Assembly.Load(an);

            Type pluginType = typeof(IAcePluginBase);
            if (assembly != null)
            {
                foreach (Type type in assembly.GetTypes())
                {
                    // only concrete types implementing the plugin interface qualify
                    if (type.IsInterface || type.IsAbstract)
                    {
                        continue;
                    }
                    if (type.GetInterface(pluginType.FullName) != null)
                    {
                        registerPlugin(type, dllFile, output);
                    }
                }
            }

            if (output != null)
            {
                // FIX: the success message was previously logged even when loading had failed
                output.log("Plugin assembly loaded: " + an.Name);
            }
        }
        catch (IOException ex)
        {
            if (output != null)
            {
                output.log("Assembly load failed - [" + dllFile + "] - consider removing the file from the plugin directory. [" + ex.Message + "] ");
            }
        }
        catch (BadImageFormatException ex)
        {
            if (output != null)
            {
                output.log("Invalid assembly detected: remove dll file [" + dllFile + "] from the plugin directory. [" + ex.Message + "] ");
                output.open("fussion-log", "Assembly load failure log:", dllFile);
                output.Append(ex.FusionLog, imbSCI.Data.enums.appends.appendType.comment, true);
                output.close();
            }
        }
        catch (Exception ex)
        {
            // FIX: output was dereferenced here without a null check, unlike the other handlers
            if (output != null)
            {
                output.log("Plugin assembly import failed [" + dllFile + "] [" + ex.Message + "] ");
            }
        }
    }

    // restore the previously configured plugin folder, when one existed
    if (old != null)
    {
        folderWithPlugins = old;
    }
}
/// <summary>
/// Transforms series of tokens into sparks: each token is expanded into a <see cref="termSpark"/>,
/// and other tokens covered by that spark are consumed from the input list (raising the spark's
/// frequency points and weight).
/// </summary>
/// <param name="tokens">The tokens; consumed (emptied) as sparks are built.</param>
/// <param name="expansion">The expansion depth passed to <c>getExpandedSpark</c>.</param>
/// <param name="loger">The loger; progress is buffered and flushed periodically.</param>
/// <param name="debug">Debug flag forwarded to <c>getExpandedSpark</c>.</param>
/// <returns>Sparks that resolved to a lemma.</returns>
public static List<termSpark> getSparks(this List<string> tokens, int expansion = 1, ILogBuilder loger = null, bool debug = true)
{
    List<termSpark> sparks = new List<termSpark>();
    if (!tokens.Any())
    {
        return sparks;
    }

    StringBuilder sb = new StringBuilder();
    string qt = "start"; // sentinel so the loop condition passes on first entry
    int tc = tokens.Count();
    int ci = 0;
    int cl = tc / 10; // flush the progress buffer roughly every tenth of the input

    while (!qt.isNullOrEmpty())
    {
        ci++;
        if (tokens.Any())
        {
            qt = tokens.First();
            tokens.Remove(qt);
        }
        else
        {
            qt = null;
            break;
        }

        termSpark spark = getExpandedSpark(qt, expansion, loger, debug);

        // consume any remaining tokens covered by this spark
        foreach (var it in spark.terms)
        {
            if (tokens.Remove(it.Key))
            {
                spark.AFreqPoints++;
                spark.weight = spark.weight + it.Value.weight;
            }
        }

        if (loger != null)
        {
            sb.Append("[" + qt + "] " + tokens.Count().imbGetPercentage(tc, 2));
            if (ci > cl)
            {
                ci = 0;
                loger.Append(sb.ToString());
                sb.Clear();
            }
        }

        if (spark.lemma != null)
        {
            sparks.Add(spark);
        }
    }

    // FIX: progress text buffered after the last periodic flush was silently discarded
    if (loger != null && sb.Length > 0)
    {
        loger.Append(sb.ToString());
        sb.Clear();
    }

    return sparks;
}
// NOTE(review): a fully commented-out draft of TransformToFVDictionaryAsPageInCategorySimilarity
// was removed here; its functionality is covered by the live TransformToFVDictionaryAsPageSimilarity
// overload called with ScoreComputationModeEnum.category.

/// <summary>
/// Transforms to fv dictionary: each document receives a feature vector holding its similarity
/// (per <paramref name="function"/>) to the aggregated model of its own web site.
/// </summary>
/// <param name="context">The document-selection context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <param name="log">Progress log.</param>
/// <returns>Feature vector sets, grouped by domain.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Site Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    Dictionary<String, WeightDictionary> categoryDictionarties = new Dictionary<string, WeightDictionary>();
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

    var byDomain = context.GetByDomain(log);

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();
    Int32 i = 0;
    // FIX: was context.Count / 10 -- "i % p" below threw DivideByZeroException
    // whenever the context held fewer than 10 items.
    Int32 p = Math.Max(1, context.Count / 10);

    foreach (var pair in byDomain)
    {
        i++;

        // aggregate all documents of the site into a single site-level model
        SpaceDocumentModel siteModel = new SpaceDocumentModel();
        foreach (var ent in pair.Value)
        {
            WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
            documentDictionarties.Add(ent.AssignedID, documentWeights);
            siteModel.Children.Add(ent.spaceDocument);
        }
        siteModel.Flatten(false);
        categoryDictionarties.Add(pair.Key, TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel));

        // score every page of the site against its site model
        foreach (var ent in pair.Value)
        {
            FeatureVector fv = new FeatureVector(ent.AssignedID);
            fv.dimensions = new double[context.spaceModel.labels.Count];
            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[pair.Key], documentDictionarties[ent.AssignedID]);
            fv.dimensions[0] = docToClassSimilarity;
            dict.GetOrAdd(pair.Key).Add(fv, -1);
        }

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation finished ...");
    return dict;
}
/// <summary>
/// Transforms to fv dictionary: each document receives a feature vector of its pairwise
/// similarities (per <paramref name="function"/>) to every other document in its group,
/// where the grouping is chosen by <paramref name="groupmode"/> (category, site or whole dataset).
/// </summary>
/// <param name="context">The document-selection context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The vector similarity function.</param>
/// <param name="groupmode">How documents are grouped before pairwise comparison.</param>
/// <param name="log">Progress log.</param>
/// <returns>Feature vector sets, keyed by group name.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
{
    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // Precompute one weight dictionary per document.
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();
    foreach (var entry in context.items)
    {
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        documentDictionarties.Add(entry.AssignedID, documentWeights);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();
    Int32 i = 0;
    // FIX: was context.Count / 10 -- "i % p" below threw DivideByZeroException
    // whenever the context held fewer than 10 items.
    Int32 p = Math.Max(1, context.Count / 10);

    Dictionary<string, List<DocumentSelectResultEntry>> relative_groups = null;
    if (groupmode == ScoreComputationModeEnum.category)
    {
        Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);
        relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);
        if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
        {
            assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
        }
        log.log("... Page Similarity ... Groups by category");
    }
    else if (groupmode == ScoreComputationModeEnum.site)
    {
        relative_groups = context.GetByDomain(log);
        log.log("... Page Similarity ... Groups by site");
    }
    else if (groupmode == ScoreComputationModeEnum.dataset)
    {
        relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
        relative_groups.Add("dataset", context.items);
        log.log("... Page Similarity ... dataset");
    }
    else
    {
        // FIX: an unhandled groupmode previously fell through to a NullReferenceException below
        throw new ArgumentOutOfRangeException("groupmode", "Unsupported score computation mode: " + groupmode.ToString());
    }

    // Cache of already computed pairwise similarities (symmetric, keyed by concatenated IDs).
    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();

    foreach (var domainPair in relative_groups)
    {
        List<DocumentSelectResultEntry> relatives = domainPair.Value;

        foreach (var entry in relatives)
        {
            i++;
            FeatureVector fv = new FeatureVector(entry.AssignedID);
            fv.dimensions = new double[relatives.Count - 1];

            Int32 hostInd = relatives.IndexOf(entry);

            Parallel.ForEach(relatives, (pair) =>
            {
                Int32 ind = relatives.IndexOf(pair);
                // the host document itself is skipped, so indices after it shift down by one
                if (ind >= hostInd)
                {
                    ind = ind - 1;
                }
                if (pair.AssignedID != entry.AssignedID)
                {
                    Double docToClassSimilarity = 0;
                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
                    }
                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
                    }
                    else
                    {
                        var vecA = documentDictionarties[pair.AssignedID];
                        var vecB = documentDictionarties[entry.AssignedID];
                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
                        // GetOrAdd is race-safe; the reverse key is covered by the lookup above
                        computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
                    }
                    fv.dimensions[ind] = docToClassSimilarity;
                }
            });

            if (i % p == 0)
            {
                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
            }

            dict.GetOrAdd(domainPair.Key).Add(fv, -1);
        }
    }

    log.log("... Preparation finished ...");
    return dict;
}