コード例 #1
0
        /**************************************************************************/

        /// <summary>
        /// Produces a cleaned copy of the raw text held by <paramref name="msDoc"/>.
        /// </summary>
        /// <remarks>
        /// Non-empty raw text is first passed through <c>HtmlEntity.DeEntitize</c>. If that call
        /// throws, the failure is logged via <c>DebugMsgStatic</c>, a remark is added to the
        /// document, and the un-decoded text is carried forward. The text is then handed to
        /// <c>CleanText</c>. Null or empty raw text is returned unchanged.
        /// </remarks>
        /// <param name="msDoc">Document whose raw text is read and which receives any remark.</param>
        /// <returns>The result of <c>CleanText</c>, or the raw text itself when it is null or empty.</returns>
        public static string CleanDocumentText(MacroscopeDocument msDoc)
        {
            string RawText = msDoc.GetDocumentTextRaw();

            // Nothing to decode or clean: hand back exactly what the document gave us.
            if (string.IsNullOrEmpty(RawText))
            {
                return RawText;
            }

            string DecodedText = RawText;

            try
            {
                DecodedText = HtmlEntity.DeEntitize(RawText);
            }
            catch (Exception ex)
            {
                // Any failure (including KeyNotFoundException) is handled the same way:
                // log it, flag the document, and continue with the un-decoded text.
                DebugMsgStatic(string.Format("CleanDocumentText: {0}", ex.Message));
                msDoc.AddRemark("CleanDocumentText", "Possibly contains invalid HTML Entities.");
            }

            return CleanText(Text: DecodedText);
        }
コード例 #2
0
        /**************************************************************************/

        /// <summary>
        /// Builds a character-frequency fingerprint of the document's raw text and stores it
        /// in <c>Fingerprint</c>.
        /// </summary>
        /// <remarks>
        /// The raw text is lower-cased, each distinct character is counted, and the counts are
        /// written out in the key order of a <see cref="SortedDictionary{TKey, TValue}"/> as one
        /// <c>character:count</c> entry per line (each terminated by <c>\n</c>). When the raw
        /// text is null or empty the fingerprint is the empty string. The update happens while
        /// holding <c>FingerprintLocker</c>; the resulting value is then passed to <c>DebugMsg</c>.
        /// </remarks>
        public void Analyze()
        {
            string Text = Document.GetDocumentTextRaw();
            string FingerprintSnapshot;

            lock (this.FingerprintLocker)
            {
                this.Fingerprint = "";

                if (!string.IsNullOrEmpty(Text))
                {
                    SortedDictionary<char, int> Tokens = new SortedDictionary<char, int>();

                    // NOTE(review): ToLower() uses the current culture; kept as-is so existing
                    // fingerprints do not change.
                    foreach (char Token in Text.ToLower())
                    {
                        // TryGetValue leaves Count at 0 for an unseen character, so a single
                        // lookup covers both the "first occurrence" and "increment" cases.
                        int Count;
                        Tokens.TryGetValue(Token, out Count);
                        Tokens[Token] = Count + 1;
                    }

                    // Accumulate in a StringBuilder rather than re-allocating the whole
                    // fingerprint string for every distinct character.
                    System.Text.StringBuilder Builder = new System.Text.StringBuilder();

                    foreach (KeyValuePair<char, int> Entry in Tokens)
                    {
                        Builder.AppendFormat("{0}:{1}\n", Entry.Key, Entry.Value);
                    }

                    this.Fingerprint = Builder.ToString();
                }

                // Capture the value while still holding the lock so the debug output
                // reflects this call's result.
                FingerprintSnapshot = this.Fingerprint;
            }

            this.DebugMsg(FingerprintSnapshot);
        }
コード例 #3
0
        /// <summary>
        /// Initializes a Levenshtein analysis for the given document.
        /// </summary>
        /// <remarks>
        /// Debug messages are suppressed, the document's raw text is lower-cased and used to
        /// construct the <c>Levenshtein</c> instance stored in <c>Monster</c>, and
        /// <c>PercentageDone</c> is reset to null.
        /// </remarks>
        /// <param name="msDoc">Document whose raw text is the reference text for comparisons.</param>
        /// <param name="SizeDifference">Value stored in <c>ComparisonSizeDifference</c>.</param>
        /// <param name="Threshold">Value stored in <c>ComparisonThreshold</c>.</param>
        /// <param name="CrossCheckList">Dictionary stored by reference in <c>CrossCheck</c>.</param>
        public MacroscopeLevenshteinAnalysis(
            MacroscopeDocument msDoc,
            int SizeDifference,
            int Threshold,
            Dictionary <string, Boolean> CrossCheckList
            )
        {
            this.SuppressDebugMsg = true;

            this.msDocOriginal = msDoc;

            // Treat a null raw text as empty so construction does not fail with a
            // NullReferenceException on ToLower().
            // NOTE(review): assumes Levenshtein accepts an empty string - confirm.
            string RawText = msDoc.GetDocumentTextRaw() ?? string.Empty;

            this.MonstrousText            = RawText.ToLower();
            this.Monster                  = new Levenshtein(MonstrousText);
            this.ComparisonSizeDifference = SizeDifference;
            this.ComparisonThreshold      = Threshold;

            this.CrossCheck = CrossCheckList;

            this.PercentageDone = null;
        }