/// <summary>
/// Checks pure-text outlink extraction against the fixture text held in <c>this.TextDoc</c>.
/// </summary>
/// <remarks>
/// A document is created for a fixed dummy text URL through a collection owned by a
/// LIVE-mode job master. After the fixture text has been processed, every outlink target
/// must appear in <c>this.TextLinks</c> and the document must report exactly 5 outlinks.
/// </remarks>
public void TestLinksInTextDocs()
{
    const string Url = @"https://nazuke.github.io/dummy.txt";

    MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
        TaskController: this
    );
    MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);

    MacroscopeDocument msDoc = DocCollection.CreateDocument(Url: Url);
    Assert.IsNotNull(msDoc, string.Format("FAIL: {0}", Url));

    msDoc.ProcessPureTextOutlinks(
        TextDoc: this.TextDoc,
        LinkType: MacroscopeConstants.InOutLinkType.PURETEXT
    );

    // Each extracted link must be one of the expected links ...
    foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
    {
        Assert.Contains(Outlink.GetTargetUrl(), this.TextLinks);
    }

    // ... and none of the expected links may be missing or duplicated.
    Assert.AreEqual(5, msDoc.CountOutlinks());
}
/** SEARCH INDEX: AND METHOD **********************************************/
/// <summary>
/// Returns the documents that are indexed under every entry of <paramref name="Terms"/>.
/// </summary>
/// <param name="Terms">Search terms to look up in <c>InvertedIndex</c>.</param>
/// <returns>
/// The documents whose hit count equals <c>Terms.Length</c>; an empty list when none qualify.
/// </returns>
public List<MacroscopeDocument> ExecuteSearchForDocumentsAND(string[] Terms)
{
    // For each document: under how many entries of Terms it was found.
    Dictionary<MacroscopeDocument, int> HitsPerDocument = new Dictionary<MacroscopeDocument, int>();

    foreach (string Term in Terms)
    {
        if (!InvertedIndex.ContainsKey(Term))
        {
            continue;
        }

        foreach (string Url in InvertedIndex[Term].Keys)
        {
            MacroscopeDocument msDoc = InvertedIndex[Term][Url];

            // TryGetValue leaves Hits at 0 the first time a document is seen.
            int Hits;
            HitsPerDocument.TryGetValue(msDoc, out Hits);
            HitsPerDocument[msDoc] = Hits + 1;
        }
    }

    List<MacroscopeDocument> Matches = new List<MacroscopeDocument>();

    foreach (KeyValuePair<MacroscopeDocument, int> Entry in HitsPerDocument)
    {
        if (Entry.Value == Terms.Length)
        {
            Matches.Add(Entry.Key);
        }
    }

    return Matches;
}
/// <summary>
/// Checks that, for every fixture in <c>this.GoodHtmlDocs</c>, each keyword reported by the
/// keyword analysis has status <c>PRESENT_IN_BODY_TEXT</c>.
/// </summary>
/// <remarks>
/// The keywords are read from the fixture's <c>meta name="keywords"</c> element and the body
/// text is the space-joined output of <c>GetNodeText</c> for the parsed document.
/// </remarks>
public void TestGoodKeywords()
{
    foreach (string HtmlDocKey in this.GoodHtmlDocs.Keys)
    {
        string Html = this.GoodHtmlDocs[HtmlDocKey];

        MacroscopeDocument msDoc = new MacroscopeDocument(Url: "https://nazuke.github.io/");
        msDoc.SetDocumentType(Type: MacroscopeConstants.DocumentType.HTML);

        HtmlDocument HtmlDoc = new HtmlDocument();
        HtmlDoc.LoadHtml(html: Html);

        List<string> CleanedText = msDoc.GetNodeText(Node: HtmlDoc.DocumentNode);

        string Keywords = HtmlDoc.DocumentNode
            .SelectSingleNode("//meta[@name='keywords']")
            .GetAttributeValue(name: "content", def: "");

        string BodyText = string.Join(" ", CleanedText.ToArray());

        Assert.IsNotEmpty(Keywords, "Keywords is empty");

        msDoc.SetKeywords(Keywords);
        msDoc.SetDocumentText(Text: BodyText);

        MacroscopeIntenseKeywordAnalysis Analyzer = new MacroscopeIntenseKeywordAnalysis();
        List<KeyValuePair<string, MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS>> KeywordPresence =
            Analyzer.AnalyzeKeywordPresence(msDoc: msDoc);

        foreach (KeyValuePair<string, MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS> Pair in KeywordPresence)
        {
            Assert.AreEqual(
                MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS.PRESENT_IN_BODY_TEXT,
                Pair.Value
            );
        }
    }
}
/** Render One ************************************************************/
/// <summary>
/// Refreshes the four link-count cells of the list view row keyed by <paramref name="Url"/>.
/// </summary>
/// <remarks>
/// Runs entirely under <c>DisplayListViewLock</c>. Nothing is changed when the list view
/// holds no row for the key. <paramref name="ListViewItems"/> and
/// <paramref name="DocCollection"/> are not used by this override.
/// </remarks>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
)
{
    lock (this.DisplayListViewLock)
    {
        if (!this.DisplayListView.Items.ContainsKey(Url))
        {
            return;
        }

        ListViewItem Row = this.DisplayListView.Items[Url];

        if (Row == null)
        {
            return;
        }

        // Resolve all column positions by key before any cell is written.
        int InlinksColumn = this.DisplayListView.Columns.IndexOfKey(MacroscopeConstants.Inlinks);
        int OutlinksColumn = this.DisplayListView.Columns.IndexOfKey(MacroscopeConstants.Outlinks);
        int HyperlinksInColumn = this.DisplayListView.Columns.IndexOfKey(MacroscopeConstants.HyperlinksIn);
        int HyperlinksOutColumn = this.DisplayListView.Columns.IndexOfKey(MacroscopeConstants.HyperlinksOut);

        Row.SubItems[InlinksColumn].Text = msDoc.CountInlinks().ToString();
        Row.SubItems[OutlinksColumn].Text = msDoc.CountOutlinks().ToString();
        Row.SubItems[HyperlinksInColumn].Text = msDoc.CountHyperlinksIn().ToString();
        Row.SubItems[HyperlinksOutColumn].Text = msDoc.CountHyperlinksOut().ToString();
    }
}
/**************************************************************************/
/// <summary>
/// Reports whether <paramref name="HyperlinkOut"/> has already been recorded for
/// <paramref name="msDoc"/>, and records it when it has not.
/// </summary>
/// <param name="msDoc">Document used as the key into <c>NodeVisited</c>.</param>
/// <param name="HyperlinkOut">Hyperlink to look up in, and if absent add to, that document's list.</param>
/// <returns>
/// <c>true</c> when the hyperlink was already in the document's list; <c>false</c> when this
/// call added it.
/// </returns>
private bool CheckNodeAlreadyVisited(
    MacroscopeDocument msDoc,
    MacroscopeHyperlinkOut HyperlinkOut
)
{
    List<MacroscopeHyperlinkOut> Recorded;

    if (!this.NodeVisited.TryGetValue(msDoc, out Recorded))
    {
        // First hyperlink seen for this document: start its list.
        Recorded = new List<MacroscopeHyperlinkOut>();
        this.NodeVisited[msDoc] = Recorded;
        Recorded.Add(HyperlinkOut);
        return false;
    }

    if (Recorded.Contains(HyperlinkOut))
    {
        return true;
    }

    Recorded.Add(HyperlinkOut);
    return false;
}
/// <summary>
/// Executes a document for each listed URL and checks that it is treated as HTML.
/// </summary>
/// <remarks>
/// Documents are created through a collection owned by a LIVE-mode job master. For each one
/// the test asserts that creation succeeded, that <c>Execute()</c> returned <c>true</c>, that
/// the document reports the URL it was created with, and that its document type is HTML.
/// </remarks>
public async Task TestHtmlDocument()
{
    List<string> UrlList = new List<string> { "https://nazuke.github.io/" };

    MacroscopeJobMaster JobMaster = new MacroscopeJobMaster(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
        TaskController: this
    );
    MacroscopeDocumentCollection DocCollection = new MacroscopeDocumentCollection(JobMaster: JobMaster);

    foreach (string Url in UrlList)
    {
        MacroscopeDocument msDoc = DocCollection.CreateDocument(Url: Url);
        Assert.IsNotNull(msDoc, string.Format("FAIL: {0}", Url));

        bool Succeeded = await msDoc.Execute();
        Assert.IsTrue(Succeeded, string.Format("FAIL: {0}", "Execute()"));

        Assert.AreEqual(Url, msDoc.GetUrl(), string.Format("FAIL: {0}", Url));

        bool IsHtml = msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML);
        Assert.IsTrue(IsHtml, string.Format("FAIL: {0}", Url));
    }
}
/**************************************************************************/
/// <summary>
/// Override of the three-argument render hook with an empty body: calling it has no effect.
/// </summary>
/// <param name="ListViewItems">Unused.</param>
/// <param name="msDoc">Unused.</param>
/// <param name="Url">Unused.</param>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocument msDoc,
    string Url
)
{
}
/**************************************************************************/
/// <summary>
/// Writes the XPath data-extraction worksheet: one header row, then one row per non-empty
/// label/value pair extracted from each eligible document.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data rows.</param>
/// <remarks>
/// Documents rejected by <c>DataExtractorXpaths.CanApplyDataExtractorsToDocument</c> are
/// skipped, as are pairs whose label or value is null or empty.
/// </remarks>
private void BuildWorksheetXpaths(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.WriteField(MacroscopeConstants.Url);
    ws.WriteField(MacroscopeConstants.StatusCode);
    ws.WriteField(MacroscopeConstants.Status);
    ws.WriteField(MacroscopeConstants.ContentType);
    ws.WriteField("Extracted Label");
    ws.WriteField("Extracted Value");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);

        // NOTE(review): presumably GetDocument can return null for a key that no longer
        // resolves (e.g. the document was removed meanwhile); skip rather than crash.
        if (msDoc == null)
        {
            continue;
        }

        if (!this.DataExtractorXpaths.CanApplyDataExtractorsToDocument(msDoc: msDoc))
        {
            continue;
        }

        // Only computed for documents that will actually be reported.
        string Status = msDoc.GetStatusCode().ToString();
        string MimeType = msDoc.GetMimeType();

        foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedXpaths())
        {
            string ExtractedLabel = DataExtractedPair.Key;
            string ExtractedValue = DataExtractedPair.Value;

            if (string.IsNullOrEmpty(ExtractedLabel) || string.IsNullOrEmpty(ExtractedValue))
            {
                continue;
            }

            this.InsertAndFormatUrlCell(ws, msDoc);
            this.InsertAndFormatStatusCodeCell(ws, msDoc);
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Status));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(MimeType));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedLabel));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedValue));
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Removes <paramref name="msDoc"/> from <c>DocumentChain</c>, holding the lock on that
/// collection for the duration of the removal.
/// </summary>
/// <param name="msDoc">Document to remove.</param>
public void RemoveDocument(MacroscopeDocument msDoc)
{
    lock (this.DocumentChain)
    {
        this.DocumentChain.Remove(msDoc);
    }
}
/**************************************************************************/
/// <summary>
/// Reports whether the pairing of <c>msDocOriginal</c> with <paramref name="msDocCompare"/>
/// has already been registered in <c>CrossCheck</c>, and registers it in both directions.
/// </summary>
/// <param name="msDocCompare">Document being paired with <c>msDocOriginal</c>.</param>
/// <returns>
/// <c>true</c> when either "original:compare" or "compare:original" (built from the two
/// checksums) was already present at the moment it was looked up; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// NOTE(review): when both checksums are equal the two keys are the same string, so the
/// second lookup finds the key the first step just added and the method returns
/// <c>true</c> even on the first call. Confirm whether that is intended.
/// </remarks>
private bool CrossCheckDocuments(MacroscopeDocument msDocCompare)
{
    string ForwardKey = string.Join(":", this.msDocOriginal.GetChecksum(), msDocCompare.GetChecksum());
    string ReverseKey = string.Join(":", msDocCompare.GetChecksum(), this.msDocOriginal.GetChecksum());

    bool SeenForward;
    bool SeenReverse;

    lock (this.CrossCheck)
    {
        SeenForward = this.CrossCheck.ContainsKey(ForwardKey);
        if (!SeenForward)
        {
            this.CrossCheck.Add(ForwardKey, true);
        }

        // Looked up only after the forward key may have been added (see remarks).
        SeenReverse = this.CrossCheck.ContainsKey(ReverseKey);
        if (!SeenReverse)
        {
            this.CrossCheck.Add(ReverseKey, true);
        }
    }

    return SeenForward || SeenReverse;
}
/**************************************************************************/
/// <summary>
/// Produces a cleaned copy of the document's raw text: HTML entities are decoded, then the
/// result is passed through <c>CleanText</c>.
/// </summary>
/// <param name="msDoc">Document supplying the raw text; also receives a remark on decode failure.</param>
/// <returns>
/// The cleaned text, or the raw text unchanged when it is null or empty.
/// </returns>
/// <remarks>
/// Any exception thrown while decoding entities is logged via <c>DebugMsgStatic</c> and
/// recorded as a remark on the document; the text as it stood before decoding is then still
/// passed to <c>CleanText</c>.
/// </remarks>
public static string CleanDocumentText(MacroscopeDocument msDoc)
{
    string Text = msDoc.GetDocumentTextRaw();

    if (string.IsNullOrEmpty(Text))
    {
        return Text;
    }

    try
    {
        Text = HtmlEntity.DeEntitize(Text);
    }
    catch (Exception ex)
    {
        // Covers KeyNotFoundException as well; both cases are handled identically.
        DebugMsgStatic(string.Format("CleanDocumentText: {0}", ex.Message));
        msDoc.AddRemark("CleanDocumentText", "Possibly contains invalid HTML Entities.");
    }

    return CleanText(Text: Text);
}
/**************************************************************************/
/// <summary>
/// Writes the URL of <paramref name="msDoc"/> as the next field of the current CSV record.
/// </summary>
/// <param name="ws">CSV writer to write to.</param>
/// <param name="msDoc">Document whose URL is written.</param>
public void InsertAndFormatUrlCell(
    CsvWriter ws,
    MacroscopeDocument msDoc
)
{
    string DocumentUrl = msDoc.GetUrl();

    ws.WriteField(DocumentUrl);
}
/**************************************************************************/
/// <summary>
/// Appends <paramref name="msDoc"/> to the end of <c>DocumentChain</c>, holding the lock on
/// that collection while doing so.
/// </summary>
/// <param name="msDoc">Document to append.</param>
public void AddDocument(MacroscopeDocument msDoc)
{
    lock (this.DocumentChain)
    {
        this.DocumentChain.AddLast(msDoc);
    }
}
/// <summary>
/// Runs language detection ten times over the listed URLs and checks that the title,
/// description and body text are each reported as "en".
/// </summary>
/// <remarks>
/// Language detection is switched on and the request timeout set to 10 seconds before any
/// document is executed. Each document must execute successfully, be HTML and have a
/// non-empty title before its detected languages are compared.
/// </remarks>
public void TestDetectLanguage()
{
    List<string> UrlList = new List<string> { "https://nazuke.github.io/SEOMacroscope/" };

    MacroscopePreferencesManager.SetDetectLanguage(Enabled: true);
    MacroscopePreferencesManager.SetRequestTimeout(Seconds: 10);

    for (int Round = 0; Round < 10; Round++)
    {
        foreach (string Url in UrlList)
        {
            MacroscopeDocument msDoc = new MacroscopeDocument(Url: Url);

            Assert.IsNotNull(msDoc, string.Format("FAIL: {0}", Url));
            Assert.IsTrue(msDoc.Execute(), string.Format("FAIL: {0}", "Execute()"));
            Assert.IsTrue(msDoc.GetIsHtml(), string.Format("FAIL: {0}", Url));
            Assert.IsNotNullOrEmpty(msDoc.GetTitle(), string.Format("FAIL: {0}", msDoc.GetTitle()));

            string LanguageTitle = msDoc.GetTitleLanguage();
            string LanguageDescription = msDoc.GetDescriptionLanguage();
            string LanguageBodyText = msDoc.GetDocumentTextLanguage();

            Assert.AreEqual(
                "en",
                LanguageTitle,
                string.Format("FAIL: {0} :: {1}", "LanguageTitle", LanguageTitle)
            );
            Assert.AreEqual(
                "en",
                LanguageDescription,
                string.Format("FAIL: {0} :: {1}", "LanguageDescription", LanguageDescription)
            );
            Assert.AreEqual(
                "en",
                LanguageBodyText,
                string.Format("FAIL: {0} :: {1}", "LanguageBodyText", LanguageBodyText)
            );
        }
    }
}
/** Render One Document *******************************************/
/// <summary>
/// Renders a single document into the list view, optionally showing a progress dialogue
/// while doing so.
/// </summary>
/// <param name="msDoc">Document to render; the call is a no-op when this is null.</param>
/// <param name="Url">
/// Not used by this method: the URL handed to the per-document renderer is taken from
/// <c>msDoc.GetUrl()</c>.
/// </param>
/// <remarks>
/// The progress form is disposed in a <c>finally</c> block so that it is released even when
/// rendering throws. The "show progress dialogues" preference is read once, so the dialogue
/// is updated and closed consistently within a single call.
/// </remarks>
public virtual void RenderListView(MacroscopeDocument msDoc, string Url)
{
    if (msDoc == null)
    {
        return;
    }

    List<ListViewItem> ListViewItems = new List<ListViewItem>(1);
    MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

    try
    {
        bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

        // A single document is rendered, so progress runs from 0 / 1 to 1 / 1.
        decimal Count = 0;
        decimal TotalDocs = (decimal)1;
        decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

        if (ShowProgress)
        {
            ProgressForm.ControlBox = false;
            ProgressForm.UpdatePercentages(
                Title: "Preparing Display",
                Message: "Processing document collection for display:",
                MajorPercentage: MajorPercentage,
                ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
            );
        }

        Application.DoEvents();

        this.RenderListView(
            ListViewItems: ListViewItems,
            msDoc: msDoc,
            Url: msDoc.GetUrl()
        );

        if (ShowProgress)
        {
            Count++;
            MajorPercentage = ((decimal)100 / TotalDocs) * Count;
            ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: MajorPercentage,
                ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
            );
        }

        this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

        if (ShowProgress)
        {
            ProgressForm.DoClose();
        }
    }
    finally
    {
        ProgressForm.Dispose();
    }
}
/**************************************************************************/
/// <summary>
/// Creates a fingerprint holder for <paramref name="msDoc"/>, starting with an empty
/// fingerprint string and a fresh lock object, and with debug messages suppressed.
/// </summary>
/// <param name="msDoc">Document stored in <c>Document</c>.</param>
public MacroscopeLevenshteinFingerprint(MacroscopeDocument msDoc)
{
    this.SuppressDebugMsg = true;

    this.FingerprintLocker = new object();
    this.Fingerprint = "";
    this.Document = msDoc;
}
/**************************************************************************/
/// <summary>
/// Writes the string form of <c>msDoc.GetIsRedirect()</c> as the next field of the current
/// CSV record.
/// </summary>
/// <param name="ws">CSV writer to write to.</param>
/// <param name="msDoc">Document whose redirect flag is written.</param>
public void InsertAndFormatRedirectCell(
    CsvWriter ws,
    MacroscopeDocument msDoc
)
{
    ws.WriteField(msDoc.GetIsRedirect().ToString());
}
/**************************************************************************/
/// <summary>
/// Writes the value of <c>msDoc.GetAllowedByRobotsAsString()</c> as the next field of the
/// current CSV record.
/// </summary>
/// <param name="ws">CSV writer to write to.</param>
/// <param name="msDoc">Document whose robots status text is written.</param>
public void InsertAndFormatRobotsCell(
    CsvWriter ws,
    MacroscopeDocument msDoc
)
{
    string RobotsText = msDoc.GetAllowedByRobotsAsString();

    ws.WriteField(RobotsText);
}
/**************************************************************************/
/// <summary>
/// Override of the four-argument render hook that performs no work.
/// </summary>
/// <param name="ListViewItems">Unused.</param>
/// <param name="DocCollection">Unused.</param>
/// <param name="msDoc">Unused.</param>
/// <param name="Url">Unused.</param>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
)
{
    // NO-OP
}
/**************************************************************************/
/// <summary>
/// Writes the sitemap-errors worksheet: a header row, then one row for every outlink of an
/// internal XML sitemap whose target is an internal document that either has a 4xx/5xx
/// status code or is not allowed by robots.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data rows.</param>
/// <remarks>
/// Outlinks whose target URL does not resolve to a document in the collection are skipped
/// instead of being dereferenced.
/// </remarks>
private void BuildWorksheetSitemapErrors(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.WriteField("Sitemap URL");
    ws.WriteField("Status Code");
    ws.WriteField("Robots");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        bool IsInternalSitemap =
            msDoc.GetIsInternal()
            && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);

        if (!IsInternalSitemap)
        {
            continue;
        }

        foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
        {
            string TargetUrl = Outlink.GetTargetUrl();
            MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

            // NOTE(review): presumably GetDocumentByUrl yields null for a URL that is not
            // in the collection; without a document there is nothing to evaluate.
            if (msDocLinked == null)
            {
                continue;
            }

            if (!msDocLinked.GetIsInternal())
            {
                continue;
            }

            int StatusCode = (int)msDocLinked.GetStatusCode();
            bool HasErrorStatus = (StatusCode >= 400) && (StatusCode <= 599);
            bool BlockedByRobots = !msDocLinked.GetAllowedByRobots();

            if (!(HasErrorStatus || BlockedByRobots))
            {
                continue;
            }

            // NOTE(review): the status-code and robots cells are taken from the sitemap
            // document (msDoc), not from the linked document that triggered the row.
            // Behaviour kept as found; confirm which document these columns should describe.
            this.InsertAndFormatUrlCell(ws, msDoc);
            this.InsertAndFormatStatusCodeCell(ws, msDoc);
            this.InsertAndFormatRobotsCell(ws, msDoc);
            this.InsertAndFormatUrlCell(ws, TargetUrl);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Queues the followable, not-yet-seen outlinks of <paramref name="msDoc"/> for crawling.
/// </summary>
/// <param name="msDoc">Document whose outlinks are examined.</param>
/// <remarks>
/// In the LISTFILE, LISTTEXT and SITEMAP run-time modes nothing is queued unless
/// <c>MacroscopePreferencesManager.GetScanSitesInList()</c> is true. An outlink is skipped
/// when it is not do-follow, has a null target URL, or is already in the job history. Once
/// the page limit (when greater than -1) has been reached, each remaining candidate is
/// logged instead of queued.
/// </remarks>
private void ProcessOutlinks(MacroscopeDocument msDoc)
{
    bool IsListDrivenRun =
        (this.JobMaster.GetRunTimeMode() == MacroscopeConstants.RunTimeMode.LISTFILE)
        || (this.JobMaster.GetRunTimeMode() == MacroscopeConstants.RunTimeMode.LISTTEXT)
        || (this.JobMaster.GetRunTimeMode() == MacroscopeConstants.RunTimeMode.SITEMAP);

    if (IsListDrivenRun && !MacroscopePreferencesManager.GetScanSitesInList())
    {
        return;
    }

    foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
    {
        if (!Outlink.GetDoFollow())
        {
            continue;
        }

        if (Outlink.GetTargetUrl() == null)
        {
            continue;
        }

        if (this.JobMaster.GetJobHistory().SeenHistoryItem(Outlink.GetTargetUrl()))
        {
            continue;
        }

        bool LimitReached =
            (this.JobMaster.GetPageLimit() > -1)
            && (this.JobMaster.GetPageLimitCount() >= this.JobMaster.GetPageLimit());

        if (LimitReached)
        {
            this.DebugMsg(
                string.Format(
                    "PAGE LIMIT REACHED: {0} :: {1}",
                    this.JobMaster.GetPageLimit(),
                    this.JobMaster.GetPageLimitCount()
                )
            );
            continue;
        }

        this.JobMaster.AddUrlQueueItem(
            Url: Outlink.GetTargetUrl(),
            Check: true
        );
    }
}
/**************************************************************************/
/// <summary>
/// Creates an analysis over <paramref name="DocumentCollection"/> with no root document and
/// empty visited-node and page-chain tables.
/// </summary>
/// <param name="DocumentCollection">Collection stored in <c>DocCollection</c>.</param>
public MacroscopeClickPathAnalysis(MacroscopeDocumentCollection DocumentCollection)
{
    this.DocCollection = DocumentCollection;
    this.RootDoc = null;

    this.PageChains = new SortedDictionary<string, List<LinkedList<string>>>();
    this.NodeVisited = new Dictionary<MacroscopeDocument, List<MacroscopeHyperlinkOut>>();
}
/**************************************************************************/
/// <summary>
/// Looks up the document stored in <c>DocumentList</c> under <paramref name="Url"/>.
/// </summary>
/// <param name="Url">Key to look up.</param>
/// <returns>The stored document, or <c>null</c> when the key is not present.</returns>
public MacroscopeDocument GetDocument(string Url)
{
    MacroscopeDocument Found;

    // On a miss TryGetValue sets Found to its default, i.e. null.
    this.DocumentList.TryGetValue(Url, out Found);

    return Found;
}
/**************************************************************************/
/// <summary>
/// Returns the document at the end of <c>DocumentChain</c>.
/// </summary>
/// <returns>
/// The last document, or <c>null</c> when the chain is empty.
/// </returns>
/// <remarks>
/// The read happens under the lock on <c>DocumentChain</c>. The last node is checked for
/// null before its value is read: an empty linked list has no last node, and reading
/// <c>.Value</c> through it would throw <c>NullReferenceException</c>.
/// </remarks>
public MacroscopeDocument GetLastDocument()
{
    MacroscopeDocument msDoc = null;

    lock (this.DocumentChain)
    {
        if (this.DocumentChain.Last != null)
        {
            msDoc = this.DocumentChain.Last.Value;
        }
    }

    return msDoc;
}
/**************************************************************************/
/// <summary>
/// Runs the click-path analysis starting from <paramref name="RootDoc"/>.
/// </summary>
/// <param name="RootDoc">Document stored as the root and used as the first parent for <c>Descend</c>.</param>
/// <remarks>
/// Clears <c>NodeVisited</c> and <c>PageChains</c>, calls <c>Descend</c> with an empty page
/// chain, then writes the contents of <c>PageChains</c> to the debug log.
/// </remarks>
public void Analyze(MacroscopeDocument RootDoc)
{
    const string Separator = "######################################################";

    this.RootDoc = RootDoc;
    this.NodeVisited.Clear();
    this.PageChains.Clear();

    this.Descend(
        PageChain: new LinkedList<string>(),
        ParentDoc: RootDoc
    );

    this.DebugMsg(Separator);

    // TODO: Remove this after debugging:
    foreach (KeyValuePair<string, List<LinkedList<string>>> Entry in this.PageChains)
    {
        string Url = Entry.Key;
        List<LinkedList<string>> Chains = Entry.Value;

        this.DebugMsg(string.Format("PageChains URL: {0}", Url));

        for (int Index = 0; Index < Chains.Count; Index++)
        {
            this.DebugMsg(string.Format("----{0}: {1}", Index, Url));

            foreach (string ChainedUrl in Chains[Index])
            {
                this.DebugMsg(string.Format("--------ChainedUrl: {0}", ChainedUrl));
            }
        }
    }

    this.DebugMsg(Separator);

    this.DebugMsg(Separator);
}
/**************************************************************************/
/// <summary>
/// Shows the URL, title, description and keywords of <paramref name="msDoc"/> in the list
/// view row keyed by <paramref name="Url"/>.
/// </summary>
/// <param name="ListViewItems">Receives the newly built row when the list view has none for the key.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document supplying title, description and keywords.</param>
/// <param name="Url">Row key and content of the first column.</param>
/// <remarks>
/// An existing row is updated in place; otherwise a new row is created and appended to
/// <paramref name="ListViewItems"/>. Exceptions raised while updating or building a row are
/// logged through <c>DebugMsg</c> and not rethrown.
/// </remarks>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
)
{
    string Title = msDoc.GetTitle();
    string Description = msDoc.GetDescription();
    string Keywords = msDoc.GetKeywords();
    string PairKey = string.Join("", Url);

    if (!this.DisplayListView.Items.ContainsKey(PairKey))
    {
        try
        {
            ListViewItem NewRow = new ListViewItem(PairKey)
            {
                UseItemStyleForSubItems = false,
                Name = PairKey
            };

            NewRow.SubItems[0].Text = Url;
            NewRow.SubItems.Add(Title);
            NewRow.SubItems.Add(Description);
            NewRow.SubItems.Add(Keywords);

            ListViewItems.Add(NewRow);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("MacroscopeDisplaySearchCollection 2: {0}", ex.Message));
        }

        return;
    }

    try
    {
        ListViewItem ExistingRow = this.DisplayListView.Items[PairKey];

        ExistingRow.SubItems[0].Text = Url;
        ExistingRow.SubItems[1].Text = Title;
        ExistingRow.SubItems[2].Text = Description;
        ExistingRow.SubItems[3].Text = Keywords;
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("MacroscopeDisplaySearchCollection 1: {0}", ex.Message));
    }
}
/**************************************************************************/
/// <summary>
/// Removes the entry keyed by the URL of <paramref name="msDoc"/> from <c>DocumentList</c>,
/// holding the lock on that collection while doing so.
/// </summary>
/// <param name="msDoc">Document whose URL identifies the entry to remove.</param>
public void RemoveDocument(MacroscopeDocument msDoc)
{
    string Key = msDoc.GetUrl();

    lock (this.DocumentList)
    {
        // Removing a key that is not present is a no-op, so no prior existence check is made.
        this.DocumentList.Remove(Key);
    }
}
/// <summary>
/// Checks that <c>GetNodeText</c> returns a non-empty list for every fixture in
/// <c>this.HtmlDocs</c>.
/// </summary>
public void TestGetNodeText()
{
    foreach (string HtmlDocKey in this.HtmlDocs.Keys)
    {
        // Parse the fixture markup.
        HtmlDocument HtmlDoc = new HtmlDocument();
        HtmlDoc.LoadHtml(html: this.HtmlDocs[HtmlDocKey]);

        // Extract text from the parsed tree, starting at the document node.
        MacroscopeDocument msDoc = new MacroscopeDocument(Url: "https://nazuke.github.io/");
        List<string> CleanedText = msDoc.GetNodeText(Node: HtmlDoc.DocumentNode);

        Assert.IsNotEmpty(CleanedText, "CleanedText is empty");
    }
}
/**************************************************************************/
/// <summary>
/// Obtains a readability analyzer for the ISO language code reported by
/// <paramref name="msDoc"/>.
/// </summary>
/// <param name="msDoc">Document whose ISO language code selects the analyzer.</param>
/// <returns>
/// The result of the language-code overload of <c>AnalyzerFactory</c>, or <c>null</c> when
/// the document's language code is null or empty.
/// </returns>
public static IMacroscopeAnalyzeReadability AnalyzerFactory(MacroscopeDocument msDoc)
{
    string LanguageCode = msDoc.GetIsoLanguageCode();

    if (string.IsNullOrEmpty(LanguageCode))
    {
        return null;
    }

    return MacroscopeAnalyzeReadability.AnalyzerFactory(IsoLanguageCode: LanguageCode);
}
/**************************************************************************/
/// <summary>
/// Writes the redirects-audit worksheet: a header row, then one row per redirecting document
/// that has both an origin URL and a destination URL.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data rows.</param>
/// <remarks>
/// Each data row holds the origin URL, the numeric status code, the status code's string
/// form, and the destination URL.
/// </remarks>
private void BuildWorksheetPageRedirectsAudit(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.WriteField("Origin URL");
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Destination URL");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);

        // NOTE(review): presumably GetDocument can return null for a key that no longer
        // resolves; skip rather than crash.
        if (msDoc == null || !msDoc.GetIsRedirect())
        {
            continue;
        }

        string OriginURL = msDoc.GetUrlRedirectFrom();
        string DestinationURL = msDoc.GetUrlRedirectTo();

        // A row is only meaningful when both ends of the redirect are known.
        if (string.IsNullOrEmpty(OriginURL) || string.IsNullOrEmpty(DestinationURL))
        {
            continue;
        }

        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();

        this.InsertAndFormatUrlCell(ws, OriginURL);
        this.InsertAndFormatContentCell(ws, StatusCode);
        this.InsertAndFormatContentCell(ws, Status);
        this.InsertAndFormatUrlCell(ws, DestinationURL);
        ws.NextRecord();
    }
}