/// <summary>
/// Builds the semantic cloud for the given chunk and term tables, dispatching to the algorithm selected by <c>settings.algorithm</c>.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table; its name becomes the <c>className</c> of a cloud created here.</param>
/// <param name="output">Cloud to fill; a new instance is created when <c>null</c>.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">The resolver; forwarded to the complex and alternative algorithms only.</param>
/// <returns>The cloud after the cloud weaver has processed it and its index was rebuilt.</returns>
/// <exception cref="aceScienceException">Raised by the link-building step when <c>assignChunkTableWeightToLink</c> is enabled, <c>doSumExistingLinkWeights</c> is disabled and two lemmas are already linked.</exception>
public lemmaSemanticCloud process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud { className = termTable.name };
    }

    // An algorithm value outside the three handled ones leaves the cloud untouched before weaving.
    var algorithm = settings.algorithm;
    if (algorithm == cloudConstructorAlgorithm.complex)
    {
        output = processPOSEnhanced(chunkTable, termTable, output, logger, subjects, resolver);
    }
    else if (algorithm == cloudConstructorAlgorithm.standard)
    {
        output = processStandard(chunkTable, termTable, output, logger, subjects);
    }
    else if (algorithm == cloudConstructorAlgorithm.alternative)
    {
        output = processAlternative(chunkTable, termTable, output, logger, subjects, resolver);
    }

    // The index is rebuilt both before and after the weaver runs.
    output.RebuildIndex();
    output.weaverReport = settings.cloudWeaver.Process(output, logger);
    output.RebuildIndex();

    return output;
}
/// <summary>
/// Creates <c>WLTableOfIndustryClass</c> from the "tfidfTable.xml" path of this instance and
/// requests a build when the resulting table has no entries.
/// </summary>
public override void OnLoaded()
{
    var tablePath = folder.pathFor(name + "tfidfTable.xml");
    WLTableOfIndustryClass = new webLemmaTermTable(tablePath, nameof(WLTableOfIndustryClass));

    // doBuild is only ever raised here; a non-empty table leaves the flag as it was.
    Boolean tableIsEmpty = WLTableOfIndustryClass.Count == 0;
    if (tableIsEmpty)
    {
        doBuild = true;
    }
}
/// <summary>
/// Clears the lemma table stored for the given class and refills it from the tokens of the supplied sites,
/// using the master constructor of the experiment context.
/// </summary>
/// <param name="tools">Source of the lemma resource handed to the constructor.</param>
/// <param name="documentSetClass">The class whose table is rebuilt.</param>
/// <param name="sites">Sites whose tokens feed the table.</param>
/// <returns>The same table instance that is kept in <c>knowledgeByClass</c> for the class.</returns>
protected webLemmaTermTable BuildLemmaTableForClass(classifierTools tools, IDocumentSetClass documentSetClass, List<pipelineTaskMCSiteSubject> sites)
{
    var classContext = items[documentSetClass.name];

    experimentContext.notes.log("Master TF-IDF table construction (used for POS flagging)... [" + documentSetClass.name + "]");

    // The table is reused, not replaced: existing content is dropped before the rebuild.
    webLemmaTermTable classTable = knowledgeByClass[documentSetClass].WLTableOfIndustryClass;
    classTable.Clear();

    var constructor = experimentContext.masterConstructor;
    constructor.process(
        GetTokensForSites<IPipelineTaskSubject>(sites),
        cnt_level.mcPage,
        classTable,
        tools.GetLemmaResource(),
        classContext.logger,
        false);

    return classTable;
}
/// <summary>
/// Creates the lemma and chunk tables for this instance and loads the two semantic clouds
/// (plain and filtered) when they have not been assigned yet.
/// </summary>
public override void OnLoaded()
{
    var lemmaTablePath = folder.pathFor(name + "WLTable.xml", getWritableFileMode.none, "Web Lemma TF-IDF table for [" + name + "] - as XML Serialized object");
    WLTableOfIndustryClass = new webLemmaTermTable(lemmaTablePath, name + "Lemmas");

    var chunkTablePath = folder.pathFor(name + "WLChunkTable.xml", getWritableFileMode.none, "Chunks TF-IDF table for [" + name + "] - as XML Serialized object");
    WLChunkTableOfIndustryClass = new webLemmaTermTable(chunkTablePath, name + "Chunks");

    if (semanticCloud == null)
    {
        var cloudPath = folder.pathFor(name + "Cloud.xml", getWritableFileMode.existing, "Initial version of the semantic cloud, extracted from the Chunk Table");
        semanticCloud = lemmaSemanticCloud.Load<lemmaSemanticCloud>(cloudPath, true);
        semanticCloud.name = name;
    }

    if (semanticCloudFiltered == null)
    {
        var filteredCloudPath = folder.pathFor(name + "CloudFiltered.xml", getWritableFileMode.existing, "Initial version of the semantic cloud, extracted from the Chunk Table");
        semanticCloudFiltered = lemmaSemanticCloud.Load<lemmaSemanticCloud>(filteredCloudPath, true);
        semanticCloudFiltered.name = name + "flt";
    }
}
/// <summary>
/// Standard cloud construction: every chunk with at least two lemmas longer than two characters
/// contributes its lemmas as nodes and links the first (ordinally smallest) lemma to each of the others.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table; consulted for node weights when <c>assignTermTableWeightToNode</c> is set.</param>
/// <param name="output">Cloud to fill; a new instance is created when <c>null</c>.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects; the document-set frequency filter is applied only when there is more than one.</param>
/// <returns>The populated cloud.</returns>
/// <exception cref="aceScienceException">An already linked pair is met while <c>assignChunkTableWeightToLink</c> is on and <c>doSumExistingLinkWeights</c> is off.</exception>
protected lemmaSemanticCloud processStandard(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    List<webLemmaTerm> allChunks = chunkTable.GetList();

    IEnumerable<webLemmaTerm> selectedChunks = subjects.Count > 1
        ? allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit)
        : allChunks;

    // Keyed by list reference: the very same list instance is stored in lemmaSets and used for the lookup below.
    var chunkByLemmas = new Dictionary<List<String>, webLemmaTerm>();
    var lemmaSets = new List<List<String>>();
    var nodeNames = new List<String>();

    foreach (webLemmaTerm chunk in selectedChunks)
    {
        List<String> lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true).Where(x => x.Length > 2).ToList();
        if (lemmas.Count > 1)
        {
            lemmas.Sort(String.CompareOrdinal);
            lemmaSets.Add(lemmas);
            chunkByLemmas.Add(lemmas, chunk);
            nodeNames.AddRange(lemmas, true);
        }
    }

    // ---- nodes
    foreach (String nodeName in nodeNames)
    {
        Double nodeWeight = 1;
        if (settings.assignTermTableWeightToNode)
        {
            nodeWeight = termTable.ResolveSingleTerm(nodeName, logger);
        }

        if (nodeWeight > 0)
        {
            output.AddNode(nodeName, nodeWeight);
        }
    }

    // ---- links
    foreach (List<String> lemmas in lemmaSets)
    {
        String head = lemmas[0];
        if (!output.ContainsNode(head, true))
        {
            continue;
        }

        foreach (String other in lemmas)
        {
            if (other == head)
            {
                continue;
            }

            if (!output.ContainsNode(other, true))
            {
                continue;
            }

            Double linkWeight = 1;
            if (settings.assignChunkTableWeightToLink)
            {
                linkWeight = chunkByLemmas[lemmas].weight;
            }
            else if (settings.doAdjustLinkWeightByChunkSize)
            {
                linkWeight = (lemmas.Count - 1).GetRatio(1);
            }

            var link = output.GetLink(head, other);
            if (link == null)
            {
                output.AddLink(head, other, linkWeight);
                continue;
            }

            if (settings.doSumExistingLinkWeights)
            {
                link.weight += linkWeight;
                continue;
            }

            // The link exists and is not summed: with chunk-table weights the surviving weight would
            // depend on processing order, so this combination of settings is rejected.
            if (settings.assignChunkTableWeightToLink)
            {
                throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" + ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
            }
        }
    }

    return output;
}
/// <summary>
/// Builds the cloud - common part of the algorithms: creates the nodes from <c>c.nodeNames</c> and the links from <c>c.lemmasList</c>.
/// </summary>
/// <param name="c">The construct carrying node names, lemma lists, chunk weights, term categories and the trash bin.</param>
/// <param name="chunkTable">The chunk table (not read by this method).</param>
/// <param name="termTable">The term table; consulted for node weights when <c>assignTermTableWeightToNode</c> is set.</param>
/// <param name="output">The cloud to populate.</param>
/// <param name="logger">The logger (not read by this method).</param>
/// <param name="resolver">The resolver (not read by this method).</param>
/// <returns>The populated <paramref name="output"/>.</returns>
/// <exception cref="aceScienceException">An already linked pair is met while <c>assignChunkTableWeightToLink</c> is on and <c>doSumExistingLinkWeights</c> is off.</exception>
protected lemmaSemanticCloud BuildCloud(lemmaSemanticConstruct c, webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, ITextResourceResolver resolver)
{
    c.TrashBin.ForEach(x => c.nodeNames.Remove(x));

    // Case clouds and class clouds differ only in which switch enables the primary/secondary/reserve
    // factoring; the factoring itself is identical, so the switch is resolved once up front.
    Boolean applyTermFactors = c.isCaseCloud ? settings.doFactorToCaseClouds : settings.doFactorToClassClouds;

    // ---- nodes
    foreach (String n in c.nodeNames)
    {
        Double weight = 0;
        if (settings.assignTermTableWeightToNode)
        {
            // A term missing from the table keeps weight 0 and therefore produces no node.
            var lemma = termTable[n];
            if (lemma != null)
            {
                weight = lemma.weight;
            }
        }
        else
        {
            weight = 1;
        }

        if (!(weight > 0))
        {
            continue;
        }

        if (!applyTermFactors)
        {
            output.AddNode(n, weight);
        }
        else if (c.PrimaryTerms.Contains(n))
        {
            output.AddNode(n, weight * settings.PrimaryTermWeightFactor, 2);
        }
        else if (c.SecondaryTerms.Contains(n))
        {
            output.AddNode(n, weight * settings.SecondaryTermWeightFactor, 1);
        }
        else
        {
            output.AddNode(n, weight * settings.ReserveTermWeightFactor, 0);
        }
    }

    // ---- links: the first lemma of every list is linked to each other lemma of that list
    foreach (List<String> n in c.lemmasList)
    {
        String first = n[0];
        if (c.TrashBin.Contains(first))
        {
            continue;
        }

        if (!output.ContainsNode(first, true))
        {
            continue;
        }

        foreach (String m in n)
        {
            if (c.TrashBin.Contains(m))
            {
                continue;
            }

            if (m == first)
            {
                continue;
            }

            if (!output.ContainsNode(m, true))
            {
                continue;
            }

            Double weight = 1;
            if (settings.assignChunkTableWeightToLink)
            {
                weight = c.weightDict[n].weight;
            }
            else if (settings.doAdjustLinkWeightByChunkSize)
            {
                weight = (n.Count - 1).GetRatio(1);
            }

            var link = output.GetLink(first, m);
            if (link == null)
            {
                output.AddLink(first, m, weight);
            }
            else if (settings.doSumExistingLinkWeights)
            {
                link.weight += weight;
            }
            else if (settings.assignChunkTableWeightToLink)
            {
                // The existing link is kept as is; with chunk-table weights the surviving weight would
                // depend on processing order, so this combination of settings is rejected.
                throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" + ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
            }
        }
    }

    c.primaryChunks.ForEach(x => output.primaryChunks.Add(x.nominalForm));
    c.secondaryChunks.ForEach(x => output.secondaryChunks.Add(x.nominalForm));

    return output;
}
/// <summary>
/// Alternative cloud construction: lemmas are categorized as primary, secondary or reserve terms by how
/// often they occur in the selected chunks, and the cloud is then built from every chunk that contains
/// at least one relevant term.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table.</param>
/// <param name="output">Cloud to fill; a new instance is created when <c>null</c>.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects; the document-set frequency filter is applied only when there is more than one.</param>
/// <param name="resolver">The resolver; forwarded to <c>BuildCloud</c>.</param>
/// <returns>The populated cloud.</returns>
protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

    List<webLemmaTerm> allChunks = chunkTable.GetList();

    // ---- detecting the most important terms
    IEnumerable<webLemmaTerm> vipChunks = subjects.Count > 1
        ? allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit)
        : allChunks;

    instanceCountCollection<String> lemmaCounter = new instanceCountCollection<String>();

    foreach (webLemmaTerm chunk in vipChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        lemmaCounter.AddInstanceRange(lemmas);
    }

    c.RelevantTerms = lemmaCounter.getSorted();

    lemmaCounter.reCalculate();

    // Terms at the maximum frequency are primary, terms above the minimum are secondary, the rest are reserve.
    foreach (String term in c.RelevantTerms)
    {
        var frequency = lemmaCounter[term];
        if (frequency == lemmaCounter.maxFreq)
        {
            c.PrimaryTerms.Add(term);
        }
        else if (frequency > lemmaCounter.minFreq)
        {
            c.SecondaryTerms.Add(term);
        }
        else
        {
            c.ReserveTerms.Add(term);
        }
    }

    c.CollectRelevantTerms(settings.doReserveTermsForClass);
    c.LogConstruct(logger);

    // ---- collecting the chunks that mention a relevant term
    foreach (webLemmaTerm chunk in allChunks)
    {
        // Split once per chunk: the same token list serves the relevance check and the lemma list.
        var tokens = chunk.nominalForm.SplitSmart(" ", "", true, true);
        if (!c.RelevantTerms.Any(y => tokens.Contains(y)))
        {
            continue;
        }

        var lemmas = tokens.Where(x => x.Length > 2).ToList();
        if (lemmas.Count > 1)
        {
            lemmas.Sort(String.CompareOrdinal);
            c.lemmasList.Add(lemmas);
            c.weightDict.Add(lemmas, chunk);
            c.nodeNames.AddRange(lemmas, true);
        }
    }

    return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
}
/// <summary>
/// Cloud construction that iteratively refines the term construct via <c>lemmaSemanticConstruct.NextIteration</c>
/// and then admits only lemmas whose lexic unit carries an <c>N</c> or <c>A</c> tag of <c>pos_type</c>.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table.</param>
/// <param name="output">Cloud to fill; a new instance is created when <c>null</c>.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">Resolver used to look up the lexic unit of each unprocessed lemma.</param>
/// <returns>The populated cloud.</returns>
protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    List<webLemmaTerm> allChunks = chunkTable.GetList();

    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    // ---- primary term optimization: iterate until the construct stops changing, reports completion,
    // or the iteration limit is exceeded
    lemmaSemanticConstruct previous = new lemmaSemanticConstruct(subjects);
    lemmaSemanticConstruct current = new lemmaSemanticConstruct(subjects);
    while (previous != current)
    {
        previous = current;
        current = lemmaSemanticConstruct.NextIteration(current, resolver, allChunks, settings, subjects, logger);

        if (current.createdInIteration > settings.primaryTermOptimizationIterationLimit || current.OptimizationDone)
        {
            break;
        }
    }

    lemmaSemanticConstruct c = current;

    c.CollectRelevantTerms(settings.doReserveTermsForClass);

    if (!c.isCaseCloud)
    {
        c.LogConstruct(logger);
    }

    // ---- collecting the chunks that mention a relevant term
    var relevantChunks = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

    foreach (webLemmaTerm chunk in relevantChunks)
    {
        var tokens = chunk.nominalForm.SplitSmart(" ", "", true, true);
        List<String> accepted = new List<String>();

        foreach (String token in tokens)
        {
            if (!c.NotProcessed(token))
            {
                // Already categorized earlier: keep it unless it ended up in the trash bin.
                if (!c.TrashBin.Contains(token))
                {
                    accepted.Add(token);
                }

                continue;
            }

            var lexicUnit = resolver.GetLexicUnit(token, logger);
            if (lexicUnit == null)
            {
                c.TrashBin.AddUnique(token);
                continue;
            }

            var tags = lexicUnit.GetTagFromGramTags<pos_type>(pos_type.none);
            if (tags.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
            {
                c.ReserveTerms.AddUnique(token);
                accepted.Add(token);
            }
            else
            {
                c.TrashBin.AddUnique(token);
            }
        }

        if (accepted.Count > 1)
        {
            accepted.Sort(String.CompareOrdinal);
            c.lemmasList.Add(accepted);
            c.weightDict.Add(accepted, chunk);
            c.nodeNames.AddRange(accepted, true);
        }
    }

    return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
}
/// <summary>
/// Constructs the webLemmaTable from a populated counter: one lemma per index form, carrying the number of
/// distinct documents and document sets its tokens were found in and the number of its occurrences.
/// </summary>
/// <param name="counter">The counter supplying index forms and their containers.</param>
/// <param name="logger">The logger; receives a progress line roughly every fifth of the index forms.</param>
/// <param name="table">The table handed to <c>recompute</c> together with the collected lemmas.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The <paramref name="table"/> instance.</returns>
public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
{
    List<String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 totalTerms = tfdfList.Count;
    Int32 cycleLength = totalTerms / 5;
    Int32 processed = 0;
    Int32 sinceLastReport = 0;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

    foreach (String term in tfdfList)
    {
        // Null index forms are skipped entirely and do not count towards progress.
        if (term == null)
        {
            continue;
        }

        TFDFContainer cn = counter.GetContainer(term);
        if (cn != null)
        {
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
            Double termFrequency = 0;

            webLemmaTerm lemma = new webLemmaTerm();
            lemma.nominalForm = cn.indexForm;
            lemma.name = cn.indexForm;

            foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
            {
                imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                if (document != null)
                {
                    documents.AddUnique(document);

                    imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                    if (docSet != null)
                    {
                        documentSet.AddUnique(docSet);
                    }
                }

                // Every occurrence counts once, whether or not its document could be resolved.
                termFrequency += 1;
            }

            lemma.documentSetFrequency = documentSet.Count;
            lemma.AFreqPoints = cn.items.Count();
            lemma.documentFrequency = documents.Count;
            lemma.termFrequency = termFrequency;
            lemmas.Add(lemma);
        }

        processed++;
        sinceLastReport++;

        if (sinceLastReport > cycleLength)
        {
            sinceLastReport = 0;
            logger.AppendLine();
            logger.log("Chunk TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
            logger.AppendLine();
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);

    return table;
}
/// <summary>
/// Counts the chunk-level subjects of the source by their current form and builds the table from that counter.
/// </summary>
/// <param name="chunks">The source; every element is cast to <c>pipelineTaskSubjectContentToken</c>.</param>
/// <param name="document_level">The document level (not read by this method).</param>
/// <param name="table">The table.</param>
/// <param name="parser">The parser (not read by this method).</param>
/// <param name="logger">The logger.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The table returned by the counter-based overload.</returns>
public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> chunks, cnt_level document_level, webLemmaTermTable table, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
{
    TFDFCounter counter = new TFDFCounter();

    List<IPipelineTaskSubject> ordered = chunks.ToList();
    ordered.Sort((a, b) => String.CompareOrdinal(a.currentForm, b.currentForm));

    foreach (IPipelineTaskSubject subject in ordered)
    {
        var token = (pipelineTaskSubjectContentToken)subject;

        // Only subjects at chunk level are counted.
        if (token.contentLevelType != cnt_level.mcChunk)
        {
            continue;
        }

        counter.Add(token.currentForm, token);
    }

    return process(counter, logger, table, forSingleWebSite);
}
/// <summary>
/// Builds the table from a counter of token streams: one lemma per index form, carrying the number of
/// distinct documents and document sets its tokens were found in and the number of its occurrences.
/// </summary>
/// <param name="source">The source; only used to prepare a counter when none is supplied.</param>
/// <param name="document_level">The document level (not read by this method).</param>
/// <param name="table">The table handed to <c>recompute</c> together with the collected lemmas.</param>
/// <param name="parser">The parser (not read by this method).</param>
/// <param name="logger">The logger; receives progress lines.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <param name="counter">The counter; prepared from <paramref name="source"/> when <c>null</c>.</param>
/// <returns>The <paramref name="table"/> instance.</returns>
public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
{
    if (counter == null)
    {
        counter = prepareCounter(source);
    }

    List<String> indexForms = counter.GetIndexForms();
    indexForms.Sort(String.CompareOrdinal);

    Int32 total = indexForms.Count;
    Int32 iterationLimit = total + 500;
    Int32 reportEvery = total / 5;

    Int32 processed = 0;
    Int32 sinceReport = 0;
    Int32 iterations = 0;

    List<webLemmaTerm> collected = new List<webLemmaTerm>();

    foreach (String indexForm in indexForms)
    {
        // Null index forms are skipped entirely and do not count towards progress.
        if (indexForm == null)
        {
            continue;
        }

        List<imbMCDocumentElement> seenDocumentSets = new List<imbMCDocumentElement>();
        List<imbMCDocumentElement> seenDocuments = new List<imbMCDocumentElement>();
        Double occurrences = 0;

        TFDFContainer container = counter.GetContainer(indexForm);
        webLemmaTerm entry = new webLemmaTerm();

        if (container != null)
        {
            entry.nominalForm = container.indexForm;
            entry.name = container.indexForm;

            foreach (pipelineTaskSubjectContentToken token in container.items)
            {
                imbMCDocument owner = token?.mcElement?.GetParentOfType<imbMCDocument>();
                if (owner != null)
                {
                    seenDocuments.AddUnique(owner);

                    imbMCDocumentSet ownerSet = owner?.parent as imbMCDocumentSet;
                    if (ownerSet != null)
                    {
                        seenDocumentSets.AddUnique(ownerSet);
                    }
                }

                // Every occurrence counts once, whether or not its document could be resolved.
                occurrences += 1;
            }

            entry.documentSetFrequency = seenDocumentSets.Count;
            entry.AFreqPoints = container.items.Count();
            entry.documentFrequency = seenDocuments.Count;
            entry.termFrequency = occurrences;
            collected.Add(entry);
        }

        iterations++;
        processed += 1;
        sinceReport += 1;

        if (sinceReport > reportEvery)
        {
            sinceReport = 0;
            logger.AppendLine();
            logger.log("Token Streams TF processing: _" + processed.GetRatio(total).ToString("P2") + "_ ");
            logger.AppendLine();
        }

        if (iterations > iterationLimit)
        {
            logger.log("Limit broken at processing Token Streams TF processing at [" + iterations.ToString() + "]");
            break;
        }
    }

    recompute(table, logger, forSingleWebSite, collected);

    return table;
}
/// <summary>
/// Fills the rank table with primary and secondary lemmas: a lemma counted more than once across the chunks
/// found in more than one document set becomes primary, and every lemma sharing a chunk with a primary lemma
/// becomes secondary unless it already has a type.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table supplying the base weight of each lemma.</param>
/// <param name="output">The rank table to populate.</param>
/// <returns>The populated <paramref name="output"/>.</returns>
/// <remarks>
/// A lemma that has no entry in <paramref name="termTable"/> is given base weight 0 instead of failing with a
/// null reference. A reserve-term pass used to follow the loop below; it is disabled, so no reserve terms are assigned here.
/// </remarks>
public industryLemmaRankTable process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, industryLemmaRankTable output)
{
    List<webLemmaTerm> allChunks = chunkTable.GetList();
    var docSetFreq = allChunks.Where(x => x.documentSetFrequency > 1);

    instanceCountCollection<String> termCounter = new instanceCountCollection<string>();
    aceDictionarySet<String, String> dict = new aceDictionarySet<string, string>();

    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(textMapBase.SEPARATOR, "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        termCounter.AddInstanceRange(lemmas);

        // Record, for every lemma, the other lemmas it shares this chunk with.
        foreach (String lm in lemmas)
        {
            foreach (String lmi in lemmas)
            {
                if (lmi != lm)
                {
                    dict[lm].AddUnique(lmi);
                }
            }
        }
    }

    List<String> primaries = new List<string>();

    foreach (var pair in termCounter)
    {
        if (!(termCounter[pair] > 1))
        {
            continue;
        }

        primaries.Add(pair);

        industryLemmaTerm lemma = output.GetOrCreate(pair);
        lemma.termType = industryLemmaTermType.primary;

        // The term table may not contain every lemma that occurs in a chunk.
        var primaryEntry = termTable[lemma.name];
        lemma.weight = settings.PrimaryTermFactor * (primaryEntry != null ? primaryEntry.weight : 0);
        lemma.nominalForm = pair;
        output.AddOrUpdate(lemma);

        if (!dict.ContainsKey(lemma.nominalForm))
        {
            continue;
        }

        foreach (String secLemmas in dict[lemma.nominalForm])
        {
            industryLemmaTerm lemmaSec = output.GetOrCreate(secLemmas);

            // A lemma that already has a type is left untouched.
            if (lemmaSec.termType == industryLemmaTermType.none)
            {
                lemmaSec.termType = industryLemmaTermType.secondary;

                var secondaryEntry = termTable[lemmaSec.name];
                lemmaSec.weight = settings.SecondaryTermFactor * (secondaryEntry != null ? secondaryEntry.weight : 0);
                lemmaSec.nominalForm = secLemmas;
                output.AddOrUpdate(lemmaSec);
            }
        }
    }

    return output;
}
/// <summary>
/// Counts the letters-only tokens of the source by their lower-cased form and builds the lemma table from that counter.
/// </summary>
/// <param name="source">The source subjects; materialized under <c>getAllChildrenLock</c> before token extraction.</param>
/// <param name="document_level">The document level (not read by this method).</param>
/// <param name="table">Table to fill. When it already has entries the build is halted and the table is returned as is. May be <c>null</c>, in which case table creation is left to the counter-based overload.</param>
/// <param name="parser">The parser; forwarded to the counter-based overload. NOTE(review): that overload appears to dereference it without a null check - confirm before relying on the <c>null</c> default.</param>
/// <param name="logger">The logger; forwarded to the counter-based overload.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The table returned by the counter-based overload, or <paramref name="table"/> itself when the build was halted.</returns>
public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
{
    // table and logger are optional parameters, so neither is dereferenced without a check here.
    if (table != null && table.Count > 0)
    {
        if (logger != null)
        {
            logger.log("THIS TABLE " + table.name + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
        }

        // Beep only while the shared counter still holds 1.
        if (DoBeep == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
            Interlocked.Increment(ref DoBeep);
        }

        return table;
    }

    TFDFCounter counter = new TFDFCounter();

    lock (getAllChildrenLock)
    {
        // Enumerate the source once, inside the lock, and work on the snapshot afterwards.
        var listSource = source.ToList();
        source = listSource;
    }

    List<IPipelineTaskSubject> rkns = source.GetSubjectsOfLevel<IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcToken });
    rkns.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

    foreach (var tkn in rkns)
    {
        if (tkn.flagBag.ContainsAll(tkn_contains.onlyLetters))
        {
            counter.Add(tkn.currentForm.ToLower(), tkn);
        }
    }

    String tableName = table != null ? table.name : null;

    return process(tableName, parser, counter, logger, table, forSingleWebSite);
}
/// <summary>
/// Constructs the webLemmaTable: repeatedly takes the first remaining index form, asks the parser for its
/// lemma set (the parser is handed the remaining list as well), and turns every accepted lemma set into one table entry.
/// </summary>
/// <param name="tableName">Name of the table; applied to the table when not null or empty.</param>
/// <param name="parser">The parser resolving an inflected form into its lemma set.</param>
/// <param name="counter">The counter supplying index forms and their containers.</param>
/// <param name="logger">The logger.</param>
/// <param name="table">Table to fill; created from <paramref name="tableName"/> when <c>null</c>. A table that already has entries halts the build.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The filled table, or the untouched table when the build was halted.</returns>
protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
{
    if (table == null)
    {
        table = new webLemmaTermTable(tableName);
    }

    if (table.Count > 0)
    {
        logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");

        if (DoBeep == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
            Interlocked.Increment(ref DoBeep);
        }

        return table;
    }

    List<String> pending = counter.GetIndexForms();

    Int32 initialCount = pending.Count;
    Int32 iterationLimit = initialCount + 100;
    Int32 reportEvery = initialCount / 5;
    Int32 sinceReport = 0;
    Int32 iterations = 0;

    if (!tableName.isNullOrEmpty())
    {
        table.name = tableName;
    }

    List<webLemmaTerm> collected = new List<webLemmaTerm>();

    while (pending.Count > 0)
    {
        String term = pending[0];
        Int32 consumed = pending.Count;

        if (term != null)
        {
            lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, pending, logger);

            // The shrinkage of the pending list tells how many forms the parser took out of it.
            consumed = consumed - pending.Count;

            if (consumed == 0)
            {
                // Nothing was taken: record the term as unresolved and drop it so the loop advances.
                table.unresolved.Add(term);
                pending.Remove(term);
                consumed = 1;
            }
            else
            {
                Boolean accepted = true;

                if (settings.allowedLemmaTypes.Any())
                {
                    var tags = inflectSet.GetTagsFromGramTags<pos_type>(pos_type.none);
                    accepted = tags.ContainsAny(settings.allowedLemmaTypes);

                    // The strict policy additionally rejects any set that carries the V tag.
                    if (accepted && settings.strictPosTypePolicy && tags.Contains(pos_type.V))
                    {
                        accepted = false;
                    }
                }

                if (accepted)
                {
                    List<imbMCDocumentElement> seenDocuments = new List<imbMCDocumentElement>();
                    List<imbMCDocumentElement> seenDocumentSets = new List<imbMCDocumentElement>();
                    Double weightedFrequency = 0;

                    webLemmaTerm entry = new webLemmaTerm();
                    entry.nominalForm = inflectSet.lemmaForm;
                    entry.name = inflectSet.lemmaForm;

                    foreach (lexicInflection inflect in inflectSet.Values)
                    {
                        TFDFContainer container = counter.GetContainer(inflect.inflectedForm);
                        if (container == null)
                        {
                            entry.otherForms.AddUnique(inflect.inflectedForm);
                            continue;
                        }

                        entry.AFreqPoints += container.items.Count;

                        foreach (pipelineTaskSubjectContentToken token in container.items)
                        {
                            imbMCDocument owner = token.mcElement.GetParentOfType<imbMCDocument>();
                            seenDocuments.AddUnique(owner);

                            imbMCDocumentElement ownerSet = owner?.parent as imbMCDocumentElement;
                            if (ownerSet != null)
                            {
                                seenDocumentSets.AddUnique(ownerSet);
                            }
                            else
                            {
                                logger.log(container.indexForm + " (" + token.mcElement.toStringSafe("mcElement=null") + ")");
                            }

                            // Occurrences are weighted by the container type flagged on the token.
                            if (token.flagBag.Contains(cnt_containerType.link))
                            {
                                weightedFrequency += settings.anchorTextFactor;
                            }
                            else if (token.flagBag.Contains(cnt_containerType.title))
                            {
                                weightedFrequency += settings.titleTextFactor;
                            }
                            else
                            {
                                weightedFrequency += settings.contentTextFactor;
                            }

                            token.AddGraph(inflect);
                        }

                        entry.otherForms.AddUnique(container.indexForm);
                    }

                    entry.documentSetFrequency = seenDocumentSets.Count;
                    entry.documentFrequency = seenDocuments.Count;
                    entry.termFrequency = weightedFrequency;
                    collected.Add(entry);
                }
            }
        }

        iterations++;
        sinceReport = sinceReport + consumed;

        Int32 done = initialCount - pending.Count;

        if (sinceReport > reportEvery)
        {
            sinceReport = 0;
            logger.AppendLine();
            logger.log("TF-IDF processed: _" + done.GetRatio(initialCount).ToString("P2") + "_");
            logger.AppendLine();
        }

        // Safety valve against a pending list that never shrinks.
        if (iterations > iterationLimit)
        {
            logger.log("Limit broken at processing WEB Lemma Frequency table at [" + iterations.ToString() + "]");
            break;
        }
    }

    if (settings.doComputeTFIDF)
    {
        recompute(table, logger, forSingleWebSite, collected);
    }
    else
    {
        foreach (webLemmaTerm entry in collected)
        {
            table.Add(entry);
        }
    }

    return table;
}
/// <summary>
/// Lets the weight kernel named by <c>weightKernelName</c> compute the weights of the lemmas and then stores the lemmas in the table.
/// </summary>
/// <param name="table">The table receiving the lemmas.</param>
/// <param name="logger">The logger.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <param name="lemmas">The lemmas to weight and store.</param>
/// <returns>The <paramref name="table"/> instance.</returns>
/// <exception cref="aceGeneralException">The shared retry budget (<c>retry_global_limit</c>) dropped below zero while adding lemmas.</exception>
public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List<webLemmaTerm> lemmas)
{
    var task = new kernelComputeWeightTask(lemmas, logger, forSingleWebSite, settings);
    ITermWeightKernel weightKernel = kernelManager.GetKernel(weightKernelName);
    weightKernel.compute(task);

    // ---- saving the results
    Int32 sharedBudget = retry_global_limit;

    foreach (webLemmaTerm lemma in lemmas)
    {
        // Each lemma gets retry_limit attempts; a lemma that uses them all up is left out of the table.
        Int32 attemptsLeft = retry_limit;
        while (attemptsLeft > 0)
        {
            try
            {
                table.Add(lemma);
                break;
            }
            catch (Exception ex)
            {
                attemptsLeft--;
                sharedBudget--;

                if (doBeep)
                {
                    logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "] retries left [" + attemptsLeft + "] global[" + sharedBudget + "]");
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                }

                Thread.Sleep(250);

                if (sharedBudget < 0)
                {
                    throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                }
            }
        }
    }

    logger.log("WFT [" + table.name + "] recomputed TFmax[" + task.weightMax + "] : DFmax[" + task.documentFrequencyMax + "] TC[" + lemmas.Count + "]");

    return table;
}
/// <summary>
/// Recomputes the TF-IDF style weights of the given lemmas in place, normalizes them by the highest weight and stores the lemmas into the table.
/// </summary>
/// <param name="table">The table that receives the lemmas; the same instance is returned.</param>
/// <param name="logger">The logger receiving error, retry and summary messages.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> the document frequency maximum is scaled by the factor only when <c>settings.doAdjustIDFForCase</c> is set, and the document set factor is skipped when the document set frequency maximum equals 1.</param>
/// <param name="lemmas">The lemmas to be weighted; their termFrequency, documentFactor and weight are overwritten.</param>
/// <returns>The <paramref name="table"/> instance that was passed in.</returns>
/// <exception cref="aceGeneralException">Thrown when the number of failed Add() calls, counted over all lemmas, exceeds <c>retry_global_limit</c>.</exception>
public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List<webLemmaTerm> lemmas)
{
    Double documentSetFrequencyMax = 0;
    Double documentFrequencyMax = 0;
    Double termFrequencyMax = 0;

    if (lemmas.Count == 0)
    {
        logger.log("ERROR: NO ENTRIES IN TF-TDF TABLE [" + table.name + "] - is for single web site [" + forSingleWebSite.ToString() + "]");
    }

    // First pass: find the maxima used for normalization.
    foreach (webLemmaTerm lemma in lemmas)
    {
        documentSetFrequencyMax = Math.Max(documentSetFrequencyMax, lemma.documentSetFrequency);
        documentFrequencyMax = Math.Max(documentFrequencyMax, lemma.documentFrequency);
        termFrequencyMax = Math.Max(termFrequencyMax, lemma.termFrequency);
    }

    // The factor is skipped only for a single web site with doAdjustIDFForCase switched off;
    // the correction is applied in every case.
    if (!forSingleWebSite || settings.doAdjustIDFForCase)
    {
        documentFrequencyMax = (documentFrequencyMax * settings.documentFrequencyMaxFactor) + settings.documentFrequencyMaxCorrection;
    }
    else
    {
        documentFrequencyMax = documentFrequencyMax + settings.documentFrequencyMaxCorrection;
    }

    /// COMPUTING NON NORMALIZED WEIGHTs
    Double weightMax = Double.MinValue;

    foreach (webLemmaTerm lemma in lemmas)
    {
        // NOTE(review): GetRatio is an extension declared elsewhere - presumably a division guarded against a zero divisor. Confirm.
        lemma.termFrequency = lemma.termFrequency.GetRatio(termFrequencyMax);

        if (settings.doUseIDF)
        {
            Double documentRatio = documentFrequencyMax.GetRatio(lemma.documentFrequency);
            lemma.documentFactor = settings.doUseNaturalLog ? Math.Log(documentRatio) : Math.Log10(documentRatio);
        }
        else
        {
            lemma.documentFactor = 1;
        }

        lemma.weight = lemma.termFrequency * lemma.documentFactor;

        if (settings.doUseDocumentSet)
        {
            if ((documentSetFrequencyMax != 1) || !forSingleWebSite)
            {
                if (lemma.documentSetFrequency == 0)
                {
                    lemma.weight = 0;
                }
                else
                {
                    // The factor is 1 for the lemma with the highest document set frequency and gets lower for the less frequent ones.
                    Double docSetFactor = (1 - Math.Log10(documentSetFrequencyMax / lemma.documentSetFrequency));
                    lemma.weight = lemma.weight * docSetFactor;
                }
            }
        }

        weightMax = Math.Max(weightMax, lemma.weight);
    }

    /// WEIGHT NORMALIZATION
    foreach (webLemmaTerm lemma in lemmas)
    {
        lemma.weight = lemma.weight.GetRatio(weightMax);
    }

    /// SAVING THE RESULTS
    // globalRetry is a failure budget shared by all lemmas, retry is the budget of a single lemma.
    Int32 globalRetry = retry_global_limit;

    foreach (webLemmaTerm lemma in lemmas)
    {
        Boolean added = false;
        Int32 retry = retry_limit;

        while (retry > 0)
        {
            try
            {
                table.Add(lemma);
                added = true;
                retry = 0;
            }
            catch (Exception ex)
            {
                // NOTE(review): the reason why Add() may fail is not visible here - presumably a transient
                // contention on the table, which is why the call is repeated after a short pause. Confirm.
                retry--;
                globalRetry--;

                if (doBeep)
                {
                    logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "] retries left [" + retry + "] global[" + globalRetry + "]");
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                }

                // Give up before pausing: there is no point in waiting when no further attempt will follow.
                if (globalRetry < 0)
                {
                    throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                }

                Thread.Sleep(250);
            }
        }

        if (!added)
        {
            // The lemma is skipped (best effort), but the loss has to be visible in the log even when doBeep is off.
            logger.log("WFT [" + table.name + "] lemma [" + lemma.name + "] was NOT added - retry limit [" + retry_limit + "] exhausted");
        }
    }

    // The value logged under the TFmax label is the highest non-normalized weight.
    logger.log("WFT [" + table.name + "] recomputed TFmax[" + weightMax + "] : DFmax[" + documentFrequencyMax + "] TC[" + lemmas.Count + "]");
    return table;
}