/**************************************************************************/

/// <summary>
/// Produces a cleaned copy of a document's raw text: HTML entities are
/// decoded via <c>HtmlEntity.DeEntitize</c> and the result is passed
/// through <c>CleanText</c>.
/// </summary>
/// <param name="msDoc">
/// Document whose raw text is read; it also receives a remark when entity
/// decoding fails.
/// </param>
/// <returns>
/// The cleaned text, or the raw value unchanged when it is null or empty.
/// </returns>
public static string CleanDocumentText(MacroscopeDocument msDoc)
{
    string rawText = msDoc.GetDocumentTextRaw();

    // Nothing to decode or clean: hand back the null/empty value as-is.
    if (string.IsNullOrEmpty(rawText))
    {
        return rawText;
    }

    string decodedText = rawText;

    try
    {
        decodedText = HtmlEntity.DeEntitize(rawText);
    }
    catch (Exception ex)
    {
        // Any decoding failure (KeyNotFoundException included) is treated
        // as non-fatal: it is logged, a remark is attached to the document,
        // and cleaning continues with the undecoded text.
        DebugMsgStatic(string.Format("CleanDocumentText: {0}", ex.Message));
        msDoc.AddRemark("CleanDocumentText", "Possibly contains invalid HTML Entities.");
    }

    return CleanText(Text: decodedText);
}
/**************************************************************************/

/// <summary>
/// Computes a character-frequency fingerprint of the document's raw text and
/// stores it in <c>Fingerprint</c>.
/// </summary>
/// <remarks>
/// The text is lower-cased, every character is counted, and one
/// <c>"{char}:{count}\n"</c> line is emitted per distinct character in the
/// key order of the <see cref="SortedDictionary{TKey,TValue}"/>. When the raw
/// text is null or empty the fingerprint is set to the empty string. The
/// resulting fingerprint is also passed to <c>DebugMsg</c>.
/// </remarks>
public void Analyze()
{
    string Text = Document.GetDocumentTextRaw();
    string NewFingerprint = "";

    lock (this.FingerprintLocker)
    {
        if (!string.IsNullOrEmpty(Text))
        {
            SortedDictionary<char, int> Tokens = new SortedDictionary<char, int>();

            foreach (char Token in Text.ToLower())
            {
                // TryGetValue leaves Count at 0 for an unseen character, so a
                // single lookup covers both the "first seen" and "seen again" cases.
                int Count;
                Tokens.TryGetValue(Token, out Count);
                Tokens[Token] = Count + 1;
            }

            // Build the result in a StringBuilder; repeated string concatenation
            // would re-copy the whole fingerprint for every distinct character.
            System.Text.StringBuilder Builder = new System.Text.StringBuilder();

            foreach (KeyValuePair<char, int> Entry in Tokens)
            {
                Builder.AppendFormat("{0}:{1}\n", Entry.Key, Entry.Value);
            }

            NewFingerprint = Builder.ToString();
        }

        // Publish the finished value in one assignment while holding the lock.
        this.Fingerprint = NewFingerprint;
    }

    // Log the local copy rather than re-reading the shared field outside the lock.
    this.DebugMsg(NewFingerprint);

    return;
}
/// <summary>
/// Creates an analysis bound to one document: the document's raw text is
/// lower-cased, kept in <c>MonstrousText</c>, and used to construct the
/// <c>Levenshtein</c> instance held in <c>Monster</c>.
/// </summary>
/// <param name="msDoc">Document whose raw text is the comparison baseline.</param>
/// <param name="SizeDifference">Value stored in <c>ComparisonSizeDifference</c>.</param>
/// <param name="Threshold">Value stored in <c>ComparisonThreshold</c>.</param>
/// <param name="CrossCheckList">Dictionary stored in <c>CrossCheck</c>.</param>
public MacroscopeLevenshteinAnalysis(
    MacroscopeDocument msDoc,
    int SizeDifference,
    int Threshold,
    Dictionary<string, Boolean> CrossCheckList
)
{
    // Debug output is switched off for instances of this class.
    this.SuppressDebugMsg = true;

    this.msDocOriginal = msDoc;

    // Lower-case the baseline text once and reuse it for the Levenshtein instance.
    string rawText = msDoc.GetDocumentTextRaw();
    this.MonstrousText = rawText.ToLower();
    this.Monster = new Levenshtein(this.MonstrousText);

    this.ComparisonSizeDifference = SizeDifference;
    this.ComparisonThreshold = Threshold;
    this.CrossCheck = CrossCheckList;

    this.PercentageDone = null;
}