public void TestAnalyze3()
{
  // Runs the deep keyword analysis with Words: 3 over the cleaned text of a
  // document built from BodyText, then writes every collected term and its
  // count to the debug log.
  MacroscopeDocument Document = new MacroscopeDocument("http://localhost/");
  Document.SetDocumentText(Text: BodyText);

  MacroscopeDeepKeywordAnalysis Analyzer = new MacroscopeDeepKeywordAnalysis();
  Dictionary<string, int> TermCounts = new Dictionary<string, int>(256);

  Analyzer.Analyze(
    Text: Document.GetDocumentTextCleaned(),
    Terms: TermCounts,
    Words: 3
  );

  // Walk the key/value pairs directly; each pair carries both the term and its count.
  foreach (KeyValuePair<string, int> Entry in TermCounts)
  {
    string Message = string.Format("TOTALS 2: {0} :: {1}", Entry.Value, Entry.Key);
    this.DebugMsg(Message);
  }
}
/**************************************************************************/

/// <summary>
/// Checks each entry of the document's comma-separated keywords value against
/// the document's cleaned body text. Both are lower-cased before matching.
/// </summary>
/// <param name="msDoc">Document supplying the keywords and the cleaned body text.</param>
/// <returns>
/// One pair per non-empty keyword, flagged PRESENT_IN_BODY_TEXT or
/// MISSING_IN_BODY_TEXT; or a single ("", KEYWORDS_METATAG_EMPTY) pair when
/// the keywords value contains no non-empty entries.
/// </returns>
public List<KeyValuePair<string, KEYWORD_STATUS>> AnalyzeKeywordPresence(MacroscopeDocument msDoc)
{
  string Keywords = msDoc.GetKeywords().ToLower();
  string BodyText = msDoc.GetDocumentTextCleaned().ToLower();
  List<string> KeywordsList = new List<string>();
  List<KeyValuePair<string, KEYWORD_STATUS>> KeywordPresence = new List<KeyValuePair<string, KEYWORD_STATUS>>();

  // Split() always yields at least one element (an empty string for empty
  // input), so blank entries must be filtered out explicitly; otherwise the
  // "metatag empty" case can never be detected.
  foreach (string Keyword in Keywords.Split(','))
  {
    string KeywordCleaned = MacroscopeStringTools.CleanWhiteSpace(Keyword);

    if (KeywordCleaned.Length > 0)
    {
      KeywordsList.Add(KeywordCleaned);
    }
  }

  if (KeywordsList.Count == 0)
  {
    KeywordPresence.Add(new KeyValuePair<string, KEYWORD_STATUS>("", KEYWORD_STATUS.KEYWORDS_METATAG_EMPTY));
    return (KeywordPresence);
  }

  foreach (string Keyword in KeywordsList)
  {
    // The keyword is turned into a regular expression pattern by GetPatternForLanguage.
    string kw = this.GetPatternForLanguage(msDoc: msDoc, Keyword: Keyword);

    KEYWORD_STATUS Status = Regex.IsMatch(BodyText, kw)
      ? KEYWORD_STATUS.PRESENT_IN_BODY_TEXT
      : KEYWORD_STATUS.MISSING_IN_BODY_TEXT;

    KeywordPresence.Add(new KeyValuePair<string, KEYWORD_STATUS>(Keyword, Status));
  }

  return (KeywordPresence);
}
/**************************************************************************/

/// <summary>
/// Adds the document to the inverted index under every distinct
/// space-separated word found in its title, description, keywords and
/// cleaned body text.
/// </summary>
/// <param name="msDoc">Document whose text is indexed; it is stored in the index keyed by its URL.</param>
private void ProcessText(MacroscopeDocument msDoc)
{
  List<string> TextBlocks = new List<string>(16);
  List<string> Terms = new List<string>(256);
  // Membership set kept alongside the ordered list, so de-duplication does not
  // rescan the whole list for every word.
  HashSet<string> SeenTerms = new HashSet<string>();
  bool CaseSensitive = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

  TextBlocks.Add(msDoc.GetTitle());
  TextBlocks.Add(msDoc.GetDescription());
  TextBlocks.Add(msDoc.GetKeywords());
  TextBlocks.Add(msDoc.GetDocumentTextCleaned());

  DebugMsg(string.Format("ProcessText: TextBlocks.Count: {0}", TextBlocks.Count));

  foreach (string TextBlock in TextBlocks)
  {
    // Defensive: a null block has no words to contribute.
    if (TextBlock == null)
    {
      continue;
    }

    foreach (string Word in TextBlock.Split(' '))
    {
      // Consecutive spaces produce empty fragments; skip them.
      // HashSet.Add returns false for words that were already collected.
      if (Word.Length > 0 && SeenTerms.Add(Word))
      {
        Terms.Add(Word);
      }
    }
  }

  DebugMsg(string.Format("ProcessText: Words :: {0}", Terms.Count));

  // The URL is the same for every term, so fetch it once.
  string Url = msDoc.GetUrl();

  foreach (string CollectedTerm in Terms)
  {
    string Term = CollectedTerm;

    if (!CaseSensitive)
    {
      Term = Term.ToLower();
    }

    DebugMsg(string.Format("ProcessText: Term :: {0}", Term));

    // Fetch the per-term document map, creating it on first sight of the term.
    Dictionary<string, MacroscopeDocument> DocumentReference;

    if (!this.InvertedIndex.TryGetValue(Term, out DocumentReference))
    {
      DocumentReference = new Dictionary<string, MacroscopeDocument>();
      this.InvertedIndex.Add(Term, DocumentReference);
    }

    if (!DocumentReference.ContainsKey(Url))
    {
      DocumentReference.Add(Url, msDoc);
    }
  }
}
/**************************************************************************/

/// <summary>
/// Checks each non-empty entry of the document's comma-separated keywords
/// value against the document's title, description and cleaned body text.
/// All four texts are lower-cased before matching.
/// </summary>
/// <param name="msDoc">Document supplying the keywords, title, description and body text.</param>
/// <returns>
/// For every non-empty keyword, three pairs in title, description, body order,
/// each flagged PRESENT_* or MISSING_*; a MALFORMED_KEYWORDS_METATAG pair is
/// appended for a keyword whose processing throws. When there are no
/// non-empty keywords, a single ("", KEYWORDS_METATAG_EMPTY) pair is returned.
/// </returns>
public List<KeyValuePair<string, KEYWORD_STATUS>> AnalyzeKeywordPresence(MacroscopeDocument msDoc)
{
  string LoweredKeywords = msDoc.GetKeywords().ToLower();
  string LoweredTitle = msDoc.GetTitle().ToLower();
  string LoweredDescription = msDoc.GetDescription().ToLower();
  string LoweredBody = msDoc.GetDocumentTextCleaned().ToLower();

  List<KeyValuePair<string, KEYWORD_STATUS>> Results = new List<KeyValuePair<string, KEYWORD_STATUS>>();
  List<string> Candidates = new List<string>();

  // Collect only the entries that still have content after whitespace cleaning.
  string[] RawKeywords = LoweredKeywords.Split(',');

  for (int i = 0; i < RawKeywords.Length; i++)
  {
    string Trimmed = MacroscopeStringTools.CleanWhiteSpace(RawKeywords[i]);

    if (Trimmed.Length > 0)
    {
      Candidates.Add(Trimmed);
    }
  }

  // Nothing usable in the keywords value: report that and stop.
  if (Candidates.Count == 0)
  {
    Results.Add(new KeyValuePair<string, KEYWORD_STATUS>("", KEYWORD_STATUS.KEYWORDS_METATAG_EMPTY));
    return (Results);
  }

  foreach (string Candidate in Candidates)
  {
    try
    {
      string Pattern = this.GetPatternForLanguage(msDoc: msDoc, Keyword: Candidate);

      KEYWORD_STATUS TitleStatus = Regex.IsMatch(LoweredTitle, Pattern)
        ? KEYWORD_STATUS.PRESENT_IN_TITLE
        : KEYWORD_STATUS.MISSING_IN_TITLE;
      Results.Add(new KeyValuePair<string, KEYWORD_STATUS>(Candidate, TitleStatus));

      KEYWORD_STATUS DescriptionStatus = Regex.IsMatch(LoweredDescription, Pattern)
        ? KEYWORD_STATUS.PRESENT_IN_DESCRIPTION
        : KEYWORD_STATUS.MISSING_IN_DESCRIPTION;
      Results.Add(new KeyValuePair<string, KEYWORD_STATUS>(Candidate, DescriptionStatus));

      KEYWORD_STATUS BodyStatus = Regex.IsMatch(LoweredBody, Pattern)
        ? KEYWORD_STATUS.PRESENT_IN_BODY
        : KEYWORD_STATUS.MISSING_IN_BODY;
      Results.Add(new KeyValuePair<string, KEYWORD_STATUS>(Candidate, BodyStatus));
    }
    catch (Exception ex)
    {
      // Any failure while building or applying the pattern is logged and
      // recorded against this keyword; the remaining keywords are still processed.
      this.DebugMsg(ex.Message);
      Results.Add(new KeyValuePair<string, KEYWORD_STATUS>(Candidate, KEYWORD_STATUS.MALFORMED_KEYWORDS_METATAG));
    }
  }

  return (Results);
}