/// <summary>
/// Prepares every factor of this score model for the given selection context
/// </summary>
/// <param name="context">The document selection context passed to each factor.</param>
/// <param name="log">The log builder.</param>
public void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    foreach (var scoreFactor in Factors)
    {
        scoreFactor.Prepare(context, log);
    }
}
/// <summary>
/// Computes score for given entry by summing the weight-dictionary values of its terms
/// </summary>
/// <param name="entry">The entry whose space document terms are scored.</param>
/// <param name="context">The context (supplies the optional selected-features filter).</param>
/// <param name="log">The log.</param>
/// <returns>Sum of weight-dictionary values over the entry's admissible terms.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    Double output = 0;

    // When a non-empty selected-features set is declared, only terms contained in it may contribute.
    // Hoisted out of the loop — the condition does not change per term.
    Boolean filterBySelectedFeatures = context.selectedFeatures != null && context.selectedFeatures.Count > 0;

    foreach (String term in entry.spaceDocument.terms.GetTokens())
    {
        if (filterBySelectedFeatures && !context.selectedFeatures.ContainsKey(term))
        {
            continue;
        }

        // If query terms are declared only they are scored; otherwise every term counts.
        // (The original duplicated the accumulation statement in both branches.)
        if (!queryTerms.Any() || queryTerms.Contains(term))
        {
            output += weightDictionary.GetValue(term);
        }
    }

    return output;
}
/// <summary>
/// Merges the DS rankings - searches folder for specified input names or search pattern
/// </summary>
/// <param name="folder">The folder.</param>
/// <param name="inputNames">The input names.</param>
/// <param name="output">The output log.</param>
/// <param name="searchPattern">The search pattern.</param>
/// <returns>Feature vector dictionary built from all loaded rankings.</returns>
public static FeatureVectorDictionaryWithDimensions MergeDSRankings(folderNode folder, String inputNames, ILogBuilder output, String searchPattern = "DS_*_ranking.xml")
{
    List<string> filepaths = folder.GetOrFindFiles(inputNames, searchPattern);

    // FIX: removed two dead locals from the original — `resultOut` (allocated, never used)
    // and `tmpOutputName` (accumulated, never read).
    List<DocumentSelectResult> results = new List<DocumentSelectResult>();
    List<String> existingNames = new List<string>();

    foreach (var fp in filepaths)
    {
        var lr = DocumentSelectResult.LoadFromFile(fp, output);

        // Disambiguate duplicate ranking names by falling back to the file name
        String fn = Path.GetFileNameWithoutExtension(fp);
        if (existingNames.Contains(lr.name))
        {
            lr.name = fn;
        }
        existingNames.Add(lr.name);
        results.Add(lr);
    }

    FeatureVectorDictionaryWithDimensions featureDict = DocumentRankingExtensions.TransformToFVDictionary(results);
    return featureDict;
}
/// <summary>
/// Prepares the specified context: loads the weight dictionary and tokenizes the query terms.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">context — when no SpaceModel is declared</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    weightDictionary = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

    if (context.spaceModel == null)
    {
        String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
        throw new ArgumentException(msg, nameof(context));
    }

    // FIX: the original tested `if (context.query.isNullOrEmpty())` and then dereferenced the
    // query inside the branch — a present query was ignored while a null/empty one was
    // dereferenced. Tokenization must run only when a query IS present.
    if (!context.query.isNullOrEmpty())
    {
        context.query.QueryTerms = context.query.QueryTerms.Trim();
        List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);
        foreach (String tkn in tkns)
        {
            queryTerms.Add(context.stemmingContext.Stem(tkn));
        }
    }
}
/// <summary>
/// Scores the specified entry.
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Classifier score in machine-learning mode; otherwise the compressed score-dictionary vector, or 0 when the entry has no vector.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    if (useMachineLearning)
    {
        // Build the feature vector for this entry from the term weight model
        WeightDictionary dc_vec = TermWeightModel.GetWeights(SelectedTerms.GetKeys(), entry.spaceDocument, context.spaceModel);
        var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, entry.AssignedID);

        // FIX: single TryGetValue lookup instead of ContainsKey + indexer; -1 marks "label unknown"
        Int32 l_id;
        if (!sc_id.TryGetValue(entry.AssignedID, out l_id))
        {
            l_id = -1;
        }

        Double score = classifier.DoScore(n_vec, log, l_id);
        return score;
    }

    if (scoreDictionary.ContainsKey(entry.AssignedID))
    {
        var fv = scoreDictionary[entry.AssignedID];
        return fv.CompressNumericVector(vectorCompression);
    }

    return 0;
}
/// <summary>
/// Combines two or more precompiled document selection ranks
/// </summary>
/// <param name="inputNames">comma separated list of DS rank file names, leave empty if search pattern is used</param>
/// <param name="searchPattern">file search pattern to select source files, leave * if no file search should be performed</param>
/// <param name="compression">vector dimensions compression operation, i.e. how scores should be combined into single dimension</param>
/// <param name="outputName">Name of the output.</param>
/// <param name="doRankingFusion">if set to <c>true</c> [do ranking fusion].</param>
/// <remarks>
/// For every fold procedure, loads the referenced DS ranking files, fuses them (rank fusion or
/// score fusion by <paramref name="compression"/>) and saves the combined ranking as XML under
/// <paramref name="outputName"/>.
/// </remarks>
/// <seealso cref="aceOperationSetExecutorBase" />
public void aceOperation_makeCombineDSRanks(
    [Description("Space separated list of DS rank file names, leave empty if search pattern is used")] String inputNames = " ",
    [Description("vector dimensions compression operation, i.e. how scores should be combined into single dimension")] operation compression = operation.avg,
    [Description("Name of output Document Selection Rank file. Leave * to assign name as combination of input files")] String outputName = "*",
    [Description("If true, it will perform ranking fusion instead of simple score fusion")] Boolean doRankingFusion = true,
    [Description("file search pattern to select source files, leave * if no file search should be performed")] String searchPattern = "*")
{
    // Clone the shared document-selection setup so this run works on an independent copy
    SetupDocumentSelection setup = docSelection.data.CloneViaXML();

    // One procedure per fold of the experiment
    ProceduralFolderFor<ProcedureCreateScoreSet, SetupDocumentSelection, OperationContext, ExperimentModelExecutionContext> procedures = new ProceduralFolderFor<ProcedureCreateScoreSet, SetupDocumentSelection, OperationContext, ExperimentModelExecutionContext>(mainContext.folds, setup, mainContext.notes, parent);

    // Normalizes "*" / user-supplied names into a valid DS rank filename
    outputName = DocumentSelectResult.CheckAndMakeFilename(outputName);

    foreach (var p in procedures)
    {
        p.Open();

        DocumentSelectResult resultOut = new DocumentSelectResult();

        // Resolve the input ranking files for this fold
        var fl = mainContext.resourceProvider.GetResourceFiles(inputNames, p.fold);

        List<DocumentSelectResult> results = DocumentRankingExtensions.LoadDSRankings(fl, p.notes);

        // Fuse the loaded rankings into a single score set (rank-based or score-based fusion)
        resultOut = results.Fusion(compression, doRankingFusion, true, p.notes);

        // Persist the fused ranking into this fold's resource path
        String pt = mainContext.resourceProvider.SetResourceFilePath(outputName, p.fold);
        resultOut.saveObjectToXML(pt);

        p.Close();
    }
}
/// <summary>
/// Scores the specified entry by counting its inbound/outbound graph links and applying the configured normalizations.
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context (supplies the per-domain web site graph).</param>
/// <param name="log">The log.</param>
/// <returns>Link-based score; 0 when the entry has no counted links.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    Double score = 0;
    WebSiteGraph webSiteGraph = context.domainNameToGraph[entry.DomainID];

    freeGraphNodeAndLinks outLinks = webSiteGraph.GetLinks(entry.AssignedID, true, false);
    freeGraphNodeAndLinks inLinks = webSiteGraph.GetLinks(entry.AssignedID, false, true);

    if (functionFlags.HasFlag(GraphFactorFunctionEnum.count_outbound))
    {
        score += outLinks.Count;
    }
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.count_inbound))
    {
        score += inLinks.Count;
    }

    // Nothing counted — no normalization can change a zero score
    if (score == 0)
    {
        return score;
    }

    // FIX: every division below is now guarded — the original divided unconditionally,
    // producing Infinity (double division by integer zero) when a denominator was 0.
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_graphlinks))
    {
        var totalLinks = webSiteGraph.CountLinks();
        if (totalLinks > 0) score = score / totalLinks;
    }
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_graphnodes))
    {
        var totalNodes = webSiteGraph.CountNodes();
        if (totalNodes > 0) score = score / totalNodes;
    }
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_inbound))
    {
        if (inLinks.Count > 0) score = score / inLinks.Count;
    }
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_outbound))
    {
        if (outLinks.Count > 0) score = score / outLinks.Count;
    }
    if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_linkCount))
    {
        Int32 linkSum = inLinks.Count + outLinks.Count;
        if (linkSum > 0) score = score / linkSum;
    }

    return score;
}
/// <summary>
/// Loads a DocumentSelectResult from each of the given file paths
/// </summary>
/// <param name="filepaths">Paths of serialized DS ranking files.</param>
/// <param name="output">The log builder.</param>
/// <returns>List of loaded rankings, in input order.</returns>
public static List<DocumentSelectResult> LoadDSRankings(IEnumerable<String> filepaths, ILogBuilder output)
{
    List<DocumentSelectResult> loaded = new List<DocumentSelectResult>();
    foreach (String rankingPath in filepaths)
    {
        loaded.Add(DocumentSelectResult.LoadFromFile(rankingPath, output));
    }
    return loaded;
}
/// <summary>
/// Prepares the specified context: builds the complete dataset model and its statistics.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <param name="excludeUnknown">if set to <c>true</c>, documents under the UNKNOWN label are excluded from the dataset model.</param>
public void Prepare(DocumentSelectResult context, ILogBuilder log, bool excludeUnknown = true)
{
    var nested_dict = context.GetModelsByCategoryDomainAssignedID(log);

    // FIX: the excludeUnknown parameter was accepted but never consulted — the UNKNOWN
    // category was unconditionally removed. The flag now gates the removal; the default
    // (true) preserves the previous behavior.
    if (excludeUnknown)
    {
        // Dictionary.Remove is a no-op when the key is absent, so no ContainsKey guard is needed
        nested_dict.Remove(SpaceLabel.UNKNOWN);
    }

    datasetModel = nested_dict.NestCompleteSpaceDocumentModel(context.name, log);

    datasetStatsModel = new SpaceDocumentStatsModel(datasetModel.name, log);
    datasetStatsModel.LearnFrom(datasetModel, log, true);
}
/// <summary>
/// Prepares the specified context: loads and deploys the term weight model, restores stored
/// model data when available, and tokenizes the query terms.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">context — when no SpaceModel is declared</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
    String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
    TermWeightModel.Deploy(log);

    if (context.spaceModel == null)
    {
        String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
        throw new ArgumentException(msg, nameof(context));
    }

    if (File.Exists(p_d) && useStoredData)
    {
        // Restore precomputed weighting data instead of rebuilding the model
        WeightingModelDataSet data = objectSerialization.loadObjectFromXML<WeightingModelDataSet>(p_d, log);
        TermWeightModel.LoadModelDataSet(data, log);

        if (useSelectedFeatures)
        {
            SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
        }
    }
    else
    {
        TermWeightModel.PrepareTheModel(context.spaceModel, log);
    }

    // FIX: the original tested `if (context.query.isNullOrEmpty())` and then dereferenced the
    // query inside the branch (same inverted condition as the sibling weightDictionary factor).
    // Query tokenization must run only when a query IS present.
    if (!context.query.isNullOrEmpty())
    {
        context.query.QueryTerms = context.query.QueryTerms.Trim();
        List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);
        foreach (String tkn in tkns)
        {
            queryTerms.Add(context.stemmingContext.Stem(tkn));
        }
    }
}
// NOTE: two obsolete commented-out PrepareContext variants were removed here; the live
// implementation is the PrepareContext extension method elsewhere in this library.

/// <summary>
/// Computes, normalizes and sums the factor scores for every entry in the context.
/// </summary>
/// <param name="context">The selection context whose items are scored in place.</param>
/// <param name="log">The log.</param>
/// <returns>The same context instance, with per-factor and summed scores assigned.</returns>
public DocumentSelectResult ExecuteEvaluation(DocumentSelectResult context, ILogBuilder log)
{
    // SCORE COMPUTATION
    foreach (IScoreModelFactor factor in model.Factors)
    {
        rangeFinder ranger = new rangeFinder();

        // First pass: compute raw scores and learn the min/max range for normalization
        foreach (DocumentSelectResultEntry entry in context.items)
        {
            Double score = factor.Score(entry, context, log);
            entry.SetScore(factor, score);

            // FIX: the original tested `score != Double.NaN`, which is ALWAYS true because
            // NaN compares unequal to everything (including itself). Double.IsNaN is the
            // correct test, so NaN scores no longer poison the range.
            if (!Double.IsNaN(score))
            {
                if (factor.doNormalize)
                {
                    ranger.Learn(score);
                }
            }
        }

        // Second pass: min-max normalize (when requested) and apply the factor weight
        foreach (DocumentSelectResultEntry entry in context.items)
        {
            Double score = entry.GetScore(factor);

            // FIX: same always-true NaN comparison (`ranger.Range != Double.NaN`) corrected
            if (!Double.IsNaN(ranger.Range))
            {
                if (factor.doNormalize)
                {
                    score = score - ranger.Minimum;
                    score = score / ranger.Range;
                }
            }

            score = score * factor.weight;
            entry.SetScore(factor, score, false);
        }
    }

    // Final pass: collapse the per-factor scores into each entry's total
    foreach (DocumentSelectResultEntry entry in context.items)
    {
        entry.SumFactorScores();
    }

    return context;
}
/// <summary>
/// Loads multiple DocumentSelect results found in a folder by explicit names or search pattern
/// </summary>
/// <param name="folder">The folder to search.</param>
/// <param name="inputNames">The input names.</param>
/// <param name="output">The log builder.</param>
/// <param name="searchPattern">The search pattern.</param>
/// <returns>List of loaded rankings.</returns>
public static List<DocumentSelectResult> LoadDSRankings(folderNode folder, String inputNames, ILogBuilder output, String searchPattern = "DS_*_ranking.xml")
{
    List<string> rankingFiles = folder.GetOrFindFiles(inputNames, searchPattern, SearchOption.TopDirectoryOnly);

    List<DocumentSelectResult> loaded = new List<DocumentSelectResult>();
    foreach (var rankingPath in rankingFiles)
    {
        loaded.Add(DocumentSelectResult.LoadFromFile(rankingPath, output));
    }
    return loaded;
}
/// <summary>
/// Scores the specified entry using the configured statistical metric of its term frequencies
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Value of the selected metric for this entry's term statistics.</returns>
public override Double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    var stats = statsByAssignedID[entry.AssignedID];

    // Make sure all derived statistics are up to date before reading them
    stats.reCalculate(instanceCountCollection<string>.preCalculateTasks.all);

    Double result;
    switch (functionName)
    {
        case ScoreModelMetricFactorEnum.varianceFreq:
            result = stats.varianceFreq;
            break;

        case ScoreModelMetricFactorEnum.TotalScore:
            result = stats.TotalScore;
            break;

        case ScoreModelMetricFactorEnum.standardDeviation:
            result = stats.standardDeviation;
            break;

        case ScoreModelMetricFactorEnum.entropyFreq:
            result = stats.entropyFreq;
            break;

        case ScoreModelMetricFactorEnum.avgFreq:
            result = stats.avgFreq;
            break;

        case ScoreModelMetricFactorEnum.Count:
            result = stats.Count;
            break;

        case ScoreModelMetricFactorEnum.Ordinal:
            // Higher score for entries registered earlier in Prepare()
            result = assignedIDs.Count - assignedIDs.IndexOf(entry.AssignedID);
            break;

        default:
            result = stats.Count;
            break;
    }

    return result;
}
/// <summary>
/// Fusions the specified score sets into a single DocumentSelectResult.
/// </summary>
/// <param name="scoreSet">The score sets to combine.</param>
/// <param name="operation">The score-compression operation.</param>
/// <param name="doRankingFusion">if set to <c>true</c>, fuse by rank positions instead of raw scores.</param>
/// <param name="doDomainNormalization">if set to <c>true</c>, normalize scores within each domain first.</param>
/// <param name="log">The log.</param>
/// <returns>New result holding the fused entries.</returns>
public static DocumentSelectResult Fusion(this IEnumerable<DocumentSelectResult> scoreSet, operation operation, Boolean doRankingFusion, Boolean doDomainNormalization, ILogBuilder log)
{
    // FIX: materialize once — the original enumerated the IEnumerable repeatedly
    // (three Count() calls, Select, and a foreach).
    List<DocumentSelectResult> sets = scoreSet.ToList();

    List<DocumentSelectResultEntry> fusioned = null;

    if (doDomainNormalization)
    {
        log.log("Performing domain-level normalization over [" + sets.Count + "] document score sets");
        foreach (DocumentSelectResult set in sets)
        {
            DocumentRankingExtensions.NormalizeWithinDomain(set.items, log);
        }
    }

    if (doRankingFusion)
    {
        log.log("Ranking fusion over [" + sets.Count + "] document score sets");
        fusioned = rankFusion(sets.Select(x => x.items), log);
    }
    else
    {
        log.log("Score fusion over [" + sets.Count + "] document score sets");
        fusioned = ScoreFusion(sets.Select(x => x.items), operation, log);
    }

    DocumentSelectResult output = new DocumentSelectResult();
    output.name = "ScoreFusionBy" + operation.ToString();
    output.description = "Sets fusioned: ";
    foreach (var s in sets)
    {
        // FIX: the original used `=` here, overwriting the description on every iteration
        // so only the last set's name survived; `+=` appends each set name as intended.
        output.description += s.name + " ";
    }

    output.items.AddRange(fusioned);
    return output;
}
/// <summary>
/// Scores the specified entry from its precomputed feature vector, according to the computation mode flags.
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Computed score; 0 when no vector exists for the entry or the result is NaN.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    FeatureVectorWithLabelID fv = vectorDictionary.Get(entry.DomainID, entry.AssignedID);
    if (fv == null)
    {
        log.log("Can't find vector dictionary entry for [" + entry.DomainID + "]>[" + entry.AssignedID + "]");
        return 0;
    }

    Double sc = 0;
    if (computation.HasFlag(ScoreComputationModeEnum.offset))
    {
        sc = fv.CompressByTrueDimension(fv.labelID);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.variance))
    {
        sc = fv.dimensions.GetVarianceCoefficient();
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.distance))
    {
        sc = fv.CompressNumericVector(imbSCI.Core.enums.operation.max);
    }
    else
    {
        sc = fv.dimensions[0];
    }

    if (computation.HasFlag(ScoreComputationModeEnum.inverse))
    {
        sc = -sc;
    }

    // FIX: the original tested `sc == Double.NaN`, which is ALWAYS false (NaN never compares
    // equal to anything), so NaN scores leaked through. Double.IsNaN is the correct test.
    if (Double.IsNaN(sc))
    {
        sc = 0;
    }

    return sc;
}
/// <summary>
/// Prepares graph ranking structures (HITS or PageRank) for every domain in the context
/// </summary>
/// <param name="context">The context (supplies the per-domain web site graphs).</param>
/// <param name="log">The log.</param>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    var entriesByDomain = context.GetByDomain(log);

    foreach (var domainPair in entriesByDomain)
    {
        WebSiteGraph siteGraph = context.domainNameToGraph[domainPair.Key];
        var idMatrix = siteGraph.GetIDMatrix(scoreUnit);
        p_matrix.Add(domainPair.Key, idMatrix);

        switch (algorithm)
        {
            case GraphFactorAlgorithm.HITS:
                HITSRank hitsRank = new HITSRank();
                hitsRank.recalculate(idMatrix, convergence, steps);
                p_hits.Add(domainPair.Key, hitsRank);
                break;

            case GraphFactorAlgorithm.PageRank:
                var pageRank = new PageRank(idMatrix.GetMatrix(), alpha, convergence, steps);
                double[] rankValues = pageRank.ComputePageRank();

                // Scale floating-point ranks into integer score units
                List<Int32> scaledRanks = new List<Int32>();
                foreach (Double rankValue in rankValues)
                {
                    scaledRanks.Add(Convert.ToInt32(rankValue * scoreUnit));
                }

                Dictionary<String, Int32> ranksByID = idMatrix.MapToX(scaledRanks);
                p_rank.Add(domainPair.Key, ranksByID);
                break;
        }
    }
}
/// <summary>
/// Evaluates the saved ds ranking file at the given path.
/// </summary>
/// <param name="filepath">The filepath.</param>
/// <param name="logger">The logger.</param>
/// <param name="minDiversity">The minimum diversity.</param>
/// <returns><c>true</c> when the saved ranking exists and passes evaluation.</returns>
public static Boolean EvaluateSavedDSRanking(String filepath, ILogBuilder logger, Double minDiversity = 0.01)
{
    String path = filepath.Trim();

    // Guard: nothing to evaluate without a path
    if (path.isNullOrEmpty())
    {
        logger.log("EvaluateSavedDSRanking -- no filepath specified");
        return false;
    }

    // Guard: the ranking file must exist on disk
    if (!File.Exists(path))
    {
        logger.log("Ranking scores not found at [" + path + "]");
        return false;
    }

    DocumentSelectResult loadedRanking = DocumentSelectResult.LoadFromFile(path, logger);
    return EvaluateDSRanking(loadedRanking, logger, path, minDiversity);
}
/// <summary>
/// Prepares the factor by building per-document term frequency statistics from the context entries
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    statsByAssignedID.Clear();

    foreach (DocumentSelectResultEntry item in context.items)
    {
        instanceCountCollection<string> counts = new instanceCountCollection<string>();

        if (item.type.HasFlag(DocumentSelectEntryType.spaceDocument))
        {
            // Space-model entry: copy token frequencies straight from the document model
            SpaceDocumentModel document = item.spaceDocument;
            foreach (var token in document.terms.GetTokens())
            {
                counts.AddInstance(token, document.terms.GetTokenFrequency(token));
            }
        }
        else if (item.type.HasFlag(DocumentSelectEntryType.textDocument))
        {
            // Text entry: tokenize the raw content, optionally stemming each token
            String content = item.textDocument.content;
            List<String> tokens = content.getTokens(true, true, true, false, 4);
            foreach (String token in tokens)
            {
                String key = useStems ? context.stemmingContext.Stem(token) : token;
                counts.AddInstance(key);
            }
        }

        statsByAssignedID.Add(item.AssignedID, counts);
        assignedIDs.Add(item.AssignedID);
    }
}
/// <summary>
/// Evaluates the ds ranking: rejects rankings that contain NaN scores or have too few distinct score values.
/// </summary>
/// <param name="ds_loaded">The ds loaded.</param>
/// <param name="logger">The logger.</param>
/// <param name="filepath">The filepath (used only in log messages; defaults to the ranking name).</param>
/// <param name="minDiversity">The minimum diversity (distinct-score to entry-count ratio).</param>
/// <returns><c>true</c> when the ranking is accepted.</returns>
public static Boolean EvaluateDSRanking(DocumentSelectResult ds_loaded, ILogBuilder logger, String filepath = "", Double minDiversity = 0.01)
{
    if (filepath == "")
    {
        filepath = ds_loaded.name;
    }

    var distinct = ds_loaded.items.GetDistinctScores();
    Int32 distinctCount = distinct.Count();

    // Reject any ranking polluted by NaN scores
    if (distinct.Contains(Double.NaN))
    {
        logger.log("Ranking scores [" + filepath + "] is refused as it contains NaN entries");
        return false;
    }

    // A ranking with fewer than two distinct values carries no ordering information
    if (distinctCount < 2)
    {
        logger.log("Ranking scores [" + filepath + "] is refused as it contains [" + distinctCount + "] distinct values");
        return false;
    }

    // Diversity = share of distinct score values among all entries
    Double rate = distinctCount.GetRatio(ds_loaded.items.Count());
    if (rate < minDiversity)
    {
        logger.log("Ranking scores [" + filepath + "] is refused for having [" + rate.ToString("F5") + "] below criterion [" + minDiversity.ToString("F2") + "]");
        return false;
    }

    logger.log("Ranking scores [" + filepath + "] accepted d=[" + rate.ToString("F5") + "] c=[" + distinctCount + "] |e|=[" + ds_loaded.items.Count + "]");
    return true;
}
/// <summary>
/// Scores the specified entry from the graph ranks precomputed in Prepare.
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Precomputed HITS or PageRank score; 0 when the domain or entry has no rank.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    Double score = 0;

    switch (algorithm)
    {
        case GraphFactorAlgorithm.HITS:
            HITSRank hits;
            if (p_hits.TryGetValue(entry.DomainID, out hits))
            {
                if (hits.ContainsKey(entry.AssignedID))
                {
                    score = hits[entry.AssignedID] * scoreUnit;
                }
            }
            break;

        case GraphFactorAlgorithm.PageRank:
            // FIX: the original indexed p_rank[entry.DomainID] without checking the domain key,
            // throwing KeyNotFoundException for domains absent from Prepare — the HITS branch
            // guarded its dictionary; this branch now does too.
            Dictionary<String, Int32> ranks;
            if (p_rank.TryGetValue(entry.DomainID, out ranks) && ranks.ContainsKey(entry.AssignedID))
            {
                score = ranks[entry.AssignedID];
            }
            break;
    }

    return score;
}
/// <summary>
/// Groups the context entries by their domain name
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Dictionary of entry lists keyed by domain name.</returns>
public static Dictionary<String, List<DocumentSelectResultEntry>> GetByDomain(this DocumentSelectResult context, ILogBuilder log)
{
    // Delegates to the entry-collection overload
    var grouped = context.items.GetByDomain(log);
    return grouped;
}
/// <summary>
/// The graph registry
/// </summary>
// protected Dictionary<String, WebSiteGraph> GraphRegistry = new Dictionary<string, WebSiteGraph>();

/// <summary>
/// Prepares the specified context.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <remarks>
/// Intentionally empty: this factor performs no per-context preparation (the GraphRegistry
/// field above is commented out, so there is no state to populate here).
/// </remarks>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    // no preparation required
}
/// <summary>
/// Gets nested dictionaries of space document models: [category][domain][assignedID]
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Nested dictionaries mapping category → domain → assignedID → space document model.</returns>
public static Dictionary<String, Dictionary<String, Dictionary<String, SpaceDocumentModel>>> GetModelsByCategoryDomainAssignedID(this DocumentSelectResult context, ILogBuilder log)
{
    // Start from the entry-level nesting and project each entry to its space document model
    Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> entriesByCategory = context.GetByCategoryDomainAssignedID(log);

    var result = new Dictionary<string, Dictionary<string, Dictionary<string, SpaceDocumentModel>>>();

    foreach (var categoryPair in entriesByCategory)
    {
        var perDomain = new Dictionary<string, Dictionary<string, SpaceDocumentModel>>();

        foreach (var domainPair in categoryPair.Value)
        {
            var perPage = new Dictionary<string, SpaceDocumentModel>();
            foreach (var pagePair in domainPair.Value)
            {
                perPage.Add(pagePair.Key, pagePair.Value.spaceDocument);
            }
            perDomain.Add(domainPair.Key, perPage);
        }

        result.Add(categoryPair.Key, perDomain);
    }

    return result;
}
/// <summary>
/// Prepares the factor against the given selection context, before any Score call is made.
/// </summary>
/// <param name="context">The document selection context to prepare against.</param>
/// <param name="log">The log.</param>
public abstract void Prepare(DocumentSelectResult context, ILogBuilder log);
/// <summary>
/// Computes this factor's score for a single entry within the given selection context.
/// </summary>
/// <param name="entry">The entry to score.</param>
/// <param name="context">The document selection context.</param>
/// <param name="log">The log.</param>
/// <returns>The factor score for the entry.</returns>
public abstract Double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log);
/// <summary>
/// Gets nested dictionaries of entries: [category][domain][assignedID]
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Nested dictionaries mapping category label → domain → assignedID → entry.</returns>
public static Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> GetByCategoryDomainAssignedID(this DocumentSelectResult context, ILogBuilder log)
{
    var result = new Dictionary<string, Dictionary<string, Dictionary<string, DocumentSelectResultEntry>>>();

    var byAssigned = context.GetByAssignedID(log);

    List<string> labels = context.spaceModel.LabelToDocumentLinks.GetAllDistinctNames();

    foreach (String label in labels)
    {
        result.Add(label, new Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>());

        // Collect all entries whose space document is linked to this label
        List<SpaceDocumentModel> linkedDocuments = context.spaceModel.LabelToDocumentLinks.GetAllLinkedB(label);
        var entriesForLabel = new List<DocumentSelectResultEntry>();
        foreach (var linkedDocument in linkedDocuments)
        {
            entriesForLabel.Add(byAssigned[linkedDocument.name]);
        }

        // Second nesting level: split the label's entries by domain, then index by assigned ID
        var byDomain = entriesForLabel.GetByDomain(log);
        foreach (var domainPair in byDomain)
        {
            var pageMap = new Dictionary<string, DocumentSelectResultEntry>();
            foreach (var pageEntry in domainPair.Value)
            {
                pageMap.Add(pageEntry.AssignedID, pageEntry);
            }
            result[label].Add(domainPair.Key, pageMap);
        }
    }

    return result;
}
//public static Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> GetByCategoryDomainAssignedID (this IEnumerable<DocumentSelectResultEntry> entries, SpaceModel model, ILogBuilder log)
//{
//    Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> output = new Dictionary<string, Dictionary<string, Dictionary<string, DocumentSelectResultEntry>>>();
//    var byDomain = entries.GetByDomain(log);
//    var byCategoryAssignedID = GetByAssignIDCategory()
//}

/// <summary>
/// Gets context entries grouped by category, resolved through the category index of assigned IDs.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="catIndex">Index mapping each category to its list of assigned IDs.</param>
/// <param name="log">The log.</param>
/// <returns>Dictionary of entry lists keyed by category.</returns>
public static Dictionary<String, List<DocumentSelectResultEntry>> GetByAssignIDCategory(this DocumentSelectResult context, Dictionary<string, List<string>> catIndex, ILogBuilder log)
{
    var result = new Dictionary<string, List<DocumentSelectResultEntry>>();

    Dictionary<String, DocumentSelectResultEntry> entriesByID = context.GetByAssignedID(log);

    foreach (var categoryPair in catIndex)
    {
        var bucket = new List<DocumentSelectResultEntry>();
        foreach (var assignedID in categoryPair.Value)
        {
            bucket.Add(entriesByID[assignedID]);
        }
        result.Add(categoryPair.Key, bucket);
    }

    return result;
}
/// <summary>
/// Prepares the selection context from the operation context: entries, graphs, query and model.
/// </summary>
/// <param name="context">The operation context.</param>
/// <param name="ranking">The ranking method (may be null; when set, its model is prepared).</param>
/// <param name="folder">The working folder.</param>
/// <param name="log">The log.</param>
/// <returns>The prepared selection context.</returns>
public static DocumentSelectResult PrepareContext(this OperationContext context, DocumentRankingMethod ranking, folderNode folder, ILogBuilder log)
{
    DocumentSelectResult selectContext = new DocumentSelectResult();
    selectContext.stemmingContext = context.stemmContext;
    selectContext.spaceModel = context.spaceModel;
    selectContext.folder = folder;

    // FIX: selectedFeatures must be assigned BEFORE the ranking block below reads
    // selectContext.selectedFeatures.description — the original assigned it only after
    // that block, dereferencing the not-yet-assigned value.
    selectContext.selectedFeatures = context.SelectedFeatures;

    if (ranking != null)
    {
        selectContext.name = ranking.model.GetSignature();
        selectContext.query = ranking.query;

        builderForText builder = new builderForText();
        ranking.Describe(builder);
        builder.AppendLine("Selected features [" + selectContext.selectedFeatures.description + "].");
        selectContext.description = builder.GetContent().Replace(Environment.NewLine, "");
    }

    foreach (KeyValuePair<string, WebSiteDocuments> pair in context.webSiteByDomain)
    {
        selectContext.domainNameToGraph.Add(pair.Key, pair.Value?.extensions?.graph);

        foreach (WebSiteDocument doc in pair.Value.documents)
        {
            DocumentSelectResultEntry entry = new DocumentSelectResultEntry();

            // Text documents are currently not resolved here (earlier lookup code removed);
            // entries carry only the space document model.
            TextDocument text = null;
            string err = "";

            SpaceDocumentModel spaceDocument = context.spaceModel.documents.FirstOrDefault(x => x.name == doc.AssignedID);
            if (spaceDocument == null)
            {
                err += "Failed to find space model document for [" + doc.AssignedID + "]";
            }

            string dn = pair.Value.domain;
            entry.SetEntry(dn, doc, spaceDocument, text);

            if (!entry.HasTextOrSpaceModel)
            {
                log.log(err);
            }

            selectContext.items.Add(entry);
        }
    }

    // PREPARATION OF MODEL
    if (ranking != null)
    {
        ranking.model.Prepare(selectContext, log);
    }

    return selectContext;
}
/// <summary>
/// Prepares the factor: resolves/loads the term weighting model (or builds it from the space
/// model) and precomputes the feature-vector dictionary used by Score, according to the
/// computation mode flags.
/// </summary>
/// <param name="context">The context (provides folder, space model and items).</param>
/// <param name="log">The log.</param>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    String p_m = "";
    String p_d = "";

    // "*" acts as a wildcard placeholder in the configured name — strip it before resolving paths
    modelDefinitionFile = modelDefinitionFile.Replace("*", "");

    if (!modelDefinitionFile.isNullOrEmpty())
    {
        p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
        p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);
    }

    if (TermWeightModel == null)
    {
        log.log("Loading model from [" + p_m + "]");
        if (File.Exists(p_m))
        {
            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
        }
    }

    // NOTE(review): if no model was injected and the definition file does not exist,
    // TermWeightModel is still null here and Deploy throws — confirm callers guarantee one of the two.
    TermWeightModel.Deploy(log);

    if (File.Exists(p_d) && UseModelData)
    {
        // Restore precomputed weighting data instead of rebuilding it
        log.log("Loading model data from [" + p_d + "]");
        var dataset = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log); // WeightingModelDataSet
        TermWeightModel.LoadModelDataSet(dataset, log);
    }
    else
    {
        // No stored data (or loading disabled): compute the model from the space model
        log.log("Preparing model ...");
        TermWeightModel.PrepareTheModel(context.spaceModel, log);
    }

    // Precompute the similarity dictionary for the selected computation scope
    if (computation.HasFlag(ScoreComputationModeEnum.category))
    {
        vectorDictionary = context.TransformToFVDictionaryAsCategorySimilarity(TermWeightModel, function, log);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.site))
    {
        vectorDictionary = context.TransformToFVDictionaryAsSiteSimilarity(TermWeightModel, function, log);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.pageDivergence))
    {
        vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.site, log);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfCategory))
    {
        vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.category, log);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfDataset))
    {
        vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.dataset, log);
    }

    log.log("Category similarity ready ... [" + computation.ToString() + "]");
}
/// <summary>
/// Prepares the specified context: loads the score dictionary and, in machine-learning mode,
/// prepares the term weight model, builds training feature vectors and trains the classifier.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">context — when the score dictionary or the SpaceModel is missing</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    //context.folder.GetOrFindFiles("*", dictionaryFile + "*.xml");

    scoreDictionary = FeatureVectorDictionaryWithDimensions.LoadFile(context.folder, dictionaryFile, log); // WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

    if (scoreDictionary == null)
    {
        String msg = "Error: Failed to find score dictionary [" + dictionaryFile + "] in " + context.folder.path;
        throw new ArgumentException(msg, nameof(context));
    }

    if (useMachineLearning)
    {
        #region --------------- PREPARING TERM WEIGHT MODEL

        String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
        String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

        if (TermWeightModel == null)
        {
            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
        }

        TermWeightModel.Deploy(log);

        if (context.spaceModel == null)
        {
            String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
            throw new ArgumentException(msg, nameof(context));
        }

        if (File.Exists(p_d) && useStoredData)
        {
            // Restore precomputed weighting data instead of rebuilding the model
            WeightingModelDataSet data = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);
            TermWeightModel.LoadModelDataSet(data, log);

            if (useSelectedFeatures)
            {
                SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
            }
        }
        else
        {
            TermWeightModel.PrepareTheModel(context.spaceModel, log);
        }

        // Fall back to the context's selected features when no terms were loaded
        if (SelectedTerms.Count == 0)
        {
            SelectedTerms = context.selectedFeatures;
        }

        List <String> sel_tkns = new List <String>();
        sel_tkns.AddRange(SelectedTerms.index.Values.Select(x => x.name));

        // Last resort: use all labeled terms of the space model as the feature token set
        if (!sel_tkns.Any())
        {
            sel_tkns.AddRange(context.spaceModel.terms_known_label.GetTokens());
        }

        #endregion

        fvConstructor.Deploy(featureMethod.constructor, sel_tkns);

        classifier = featureMethod.classifierSettings.GetClassifier();

        // Map assigned IDs to label IDs from the score dictionary
        sc_id = scoreDictionary.GetVectorsWithLabelID(null, criterion).ToNameVsLabelID();

        // Build the training set: one labeled feature vector per context item with a known label
        List <FeatureVectorWithLabelID> trainingSet = new List <FeatureVectorWithLabelID>();

        foreach (var item in context.items)
        {
            if (sc_id.ContainsKey(item.AssignedID))
            {
                WeightDictionary dc_vec = TermWeightModel.GetWeights(sel_tkns, item.spaceDocument, context.spaceModel);

                var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, item.AssignedID);

                FeatureVectorWithLabelID id_vec = new FeatureVectorWithLabelID(n_vec, sc_id[item.AssignedID]);

                trainingSet.Add(id_vec);
            }
        }

        log.log("Training [" + classifier.name + "] with [" + sc_id.Count + "] feature vectors.");

        classifier.DoTraining(trainingSet, log);
    }
}