예제 #1
0
        /// <summary>
        /// Registers a rendering instruction that tells the HTML to text extraction engine (EntityPlaneMethod) which XPath-selected nodes to render into text.
        /// </summary>
        /// <param name="name">Human-readable descriptive name of the instruction, or a special instruction name such as ::BODYTEXT::.</param>
        /// <param name="flags">Instruction flags; they control what is rendered and how.</param>
        /// <param name="code">XPath associated with the instruction; selects the nodes to be rendered into text.</param>
        /// <param name="weight">Weight factor of the instruction, i.e. the number of times the content should be repeated (boosting TF).</param>
        /// <param name="expansion">Graph selection expansion steps, used to reach nodes that are one or more edges away.</param>
        /// <param name="remove">When <c>true</c>, every instruction already in the set is cleared before the new one is added.</param>
        /// <remarks>
        /// The new instruction is always appended to <c>data.instructions</c>; clearing the existing set first is optional.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_setRenderInstruction(
            [Description("Instruction name, it is human-readable descriptive name or special instructin name like ::BODYTEXT::")] String name = "::BODYTEXT::",
            [Description("Instruction flags, controls what and how to render")] DocumentRenderInstructionFlags flags   = DocumentRenderInstructionFlags.this_page_content,
            [Description("XPath associated with the instruction, selects nodes to be rendered into text")] String code = "",
            [Description("Weight factor of the instruction, i.e. number of times the content should be repeated (boosting TF)")] Double weight = 1.0,
            [Description("Graph selection expansion steps - to reach 1+ edges far nodes ")] Int32 expansion = 1,
            [Description("If true it will remove any existing instruction in the set")] Boolean remove      = false)
        {
            // Optionally start from an empty instruction set.
            if (remove)
            {
                data.instructions.Clear();
            }

            // The weight is handed to the constructor and also assigned explicitly afterwards,
            // in the same order as before: flags, expansion steps, weight.
            data.instructions.Add(new DocumentRenderInstruction(name, code, weight)
            {
                instructionFlags = flags,
                graphExpansionSteps = expansion,
                weight = weight
            });
        }
예제 #2
0
        /// <summary>
        /// Builds a <see cref="PlanesMethodSettings"/> instance populated with the default configuration.
        /// </summary>
        /// <returns>
        /// A new settings object with: description, title and body-text render instructions;
        /// <c>EnglishStemmer</c> and <c>TokenizerBasic</c> for the corpus method; a weight model using
        /// <c>TermFrequencyFunction</c> locally and one <c>IDFElement</c> global factor; a feature filter
        /// limited to 8000; and a multi-class SVM classifier trained with L2 loss.
        /// </returns>
        public static PlanesMethodSettings GetDefaultSettings()
        {
            PlanesMethodSettings settings = new PlanesMethodSettings();

            // Entity plane: render description, title and body text, in that order.
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetDescriptionInstruction());
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetTitleInstruction());
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetBodyTextInstruction());

            // blenderOptions is not assigned here (a binaryAggregation | pageLevel assignment is disabled).
            // The filter function name is left empty; nameof(DocumentEntropyFunction) is the disabled alternative.
            settings.entityMethod.filterFunctionName = "";
            settings.entityMethod.filterLimit = 5;

            // Corpus plane: text preprocessing components are referenced by type name.
            settings.corpusMethod.stemmer = nameof(EnglishStemmer);
            settings.corpusMethod.tokenizer = nameof(TokenizerBasic);
            settings.corpusMethod.transliterationRuleSetId = "";

            // Weighting model: term-frequency local function combined with a single IDFElement global factor.
            var model = new FeatureWeightModel
            {
                LocalFunction = new Weighting.Local.TermFrequencyFunction()
            };

            var idfFactor = new FeatureWeightFactor();
            idfFactor.Settings.functionName = nameof(IDFElement);
            model.GlobalFactors.Add(idfFactor);

            settings.corpusMethod.WeightModel = model;

            // Feature filter: only the limit is configured; no function settings are assigned
            // (a CollectionTDPElement-based configuration is disabled).
            var filter = new FeatureFilter
            {
                limit = 8000
            };
            settings.corpusMethod.filter = filter;

            // vectorMethod.constructor is not assigned here; both the similarity-function and the
            // direct-term-weight dimension setups are disabled.

            // Feature plane: multi-class SVM with L2 loss. A kNearestNeighbors setup (kNN_k = 4) is the disabled alternative.
            settings.featureMethod.classifierSettings.type = Classifiers.ClassifierType.multiClassSVM;
            settings.featureMethod.classifierSettings.lossFunctionForTraining = Accord.MachineLearning.VectorMachines.Learning.Loss.L2;

            return settings;
        }