private IEnumerable<ArticleHeader> parsePageInternal(string pageData)
{
    // Prepare to parse
    var html = new HtmlDocument();

    // Load the HTML string
    html.LoadHtml(pageData);

    // Look for all the article posts
    var root = html.DocumentNode;
    var articleSummaries = root.Descendants()
        .Where(n => n.GetAttributeValue("class", "").Equals(Constants.SIGNATURE_POST));

    // If nothing is found, return
    if (!articleSummaries.Any())
    {
        return null;
    }

    // Prepare the result collection
    var results = new List<ArticleHeader>(articleSummaries.Count());

    // Find data around each header
    int curArticleIndex = 0;
    foreach (var header in articleSummaries)
    {
        // Stop once we reach the limit
        if (curArticleIndex >= Constants.MAX_ARTICLES)
        {
            break;
        }

        // Try to parse an article header.
        // If something goes wrong, just report and move on.
        try
        {
            // Get the title string
            string articleTitle = header.InnerText;

            // Clean it a bit
            articleTitle = HttpUtility.HtmlDecode(articleTitle.Replace("&nbsp;", " "));

            // Extract the hyperlink
            HtmlNode articleHref = header.Descendants("a")
                .SingleOrDefault();

            // Find the authors section
            HtmlNode byAuthorsBlock = header.ParentNode.Descendants()
                .Where(n => n.GetAttributeValue("class", "").Equals(Constants.SIGNATURE_AUTHORS))
                .SingleOrDefault();

            // Find the summary paragraph
            HtmlNode excerptBlock = header.ParentNode.Descendants()
                .Where(n => n.GetAttributeValue("class", "").Equals(Constants.SIGNATURE_SUMMARY))
                .SingleOrDefault();

            // If any of the above is missing, ignore the whole block.
            // Note: SingleOrDefault() returns null when no match exists,
            // which makes this check meaningful (Single() would have thrown).
            if (articleHref == null || byAuthorsBlock == null || excerptBlock == null)
            {
                continue; // just move on
            }

            // We're OK so far.
            // Remove the time from the authors block and
            // leave only their names
            var timeNode = byAuthorsBlock.Descendants("time").SingleOrDefault();
            if (timeNode != null)
            {
                byAuthorsBlock.RemoveChild(timeNode);
            }

            // Get the authors as text
            string byAuthorsLine = byAuthorsBlock.InnerText
                .Trim(new char[] { '\t', '\n', ' ' });
            if (byAuthorsLine.StartsWith("by "))
            {
                byAuthorsLine = byAuthorsLine.Substring(3);
            }

            // Extract the article's URL string
            string articleUrl = articleHref.GetAttributeValue("href", "#");

            // The summary is simple
            var summary = excerptBlock.InnerText;

            // Sanitize it
            summary = HttpUtility.HtmlDecode(summary.Replace("&nbsp;", " "));

            // Fill the article header
            var articleHeader = new ArticleHeader()
            {
                title = articleTitle,
                url = new Uri(articleUrl),
                authors = byAuthorsLine,
                summary = summary
            };

            // Push into the results
            results.Add(articleHeader);
        }
        catch
        {
            // Gracefully inform the caller that something went wrong
            if (onReport != null)
            {
                onReport(MessageKind.Warning,
                    String.Format("Failed to parse article #{0}", curArticleIndex));
            }
        }

        // Move on
        curArticleIndex++;
    }

    // Return what we've got
    return results;
}
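The parser leans on a small data type and a handful of site-specific constants that are defined elsewhere. The sketch below reconstructs their likely shape purely from how parsePageInternal (and processArticle, further down) uses them; the member names match the code, but the CSS class strings, the article cap, and the MessageKind members are placeholders, not the real values.

// Hypothetical reconstruction: member names follow the usage in the
// listings, but the actual definitions may differ.
public enum MessageKind { Info, Warning } // severities seen in the code; more may exist

public class ArticleHeader
{
    public string title;
    public Uri url;
    public string authors;
    public string summary;
}

public static class Constants
{
    public const string SIGNATURE_POST    = "entry-title";   // CSS class of a post header (placeholder)
    public const string SIGNATURE_AUTHORS = "entry-authors"; // CSS class of the by-line (placeholder)
    public const string SIGNATURE_SUMMARY = "entry-summary"; // CSS class of the excerpt (placeholder)
    public const string SIGNATURE_CONTENT = "entry-content"; // CSS class of the article body (placeholder)
    public const int MAX_ARTICLES = 20;                      // parsing cap (placeholder)
}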
public static void processArticles(object context)
{
    ArticleParserContext ctx = (context as ArticleParserContext);
    if (ctx == null)
    {
        return; // Shouldn't be here
    }

    ctx.reportAction(MessageKind.Info,
        String.Format("Worker {0}: Started", ctx.id));

    while (!ctx.shouldStop.IsCancellationRequested)
    {
        // Prepare to acquire a task from the shared queue
        ArticleHeader task = null;

        // Try to pick a task from the queue. For this
        // we need to serialize access to the queue among
        // all the workers, so we use a simple critical section
        lock (ctx.articles)
        {
            // Check if the queue is not empty
            if (ctx.articles.Count > 0)
            {
                // Take a task from it
                task = ctx.articles.Dequeue();
            }
        }

        // Now either we acquired a task or there are no more tasks
        if (task == null || ctx.shouldStop.IsCancellationRequested)
        {
            break; // exit the working loop
        }

        // Report the allocation. Due to the multiline output
        // we have to keep other workers from printing
        // at the same time
        lock (ctx.reportAction)
        {
            ctx.reportAction(MessageKind.Info,
                String.Format("Worker {0}: Parsing the article \"{1}\"",
                    ctx.id, task.title));
        }

        // Process the article and measure its local
        // word frequency
        var localFrequency = processArticle(task, ctx.reportAction);

        // Check what's up
        if (localFrequency == null)
        {
            ctx.reportAction(MessageKind.Warning, "Failed to parse the article");
            continue;
        }

        // Merge the local frequency into the global one
        mergeStatistics(ctx, localFrequency, task);

        // Report task completed
        ctx.reportAction(MessageKind.Info,
            String.Format("Worker {0}: Parsed the article", ctx.id));
    }

    bool wasStopped = ctx.shouldStop.IsCancellationRequested;
    ctx.reportAction(wasStopped ? MessageKind.Warning : MessageKind.Info,
        String.Format("Worker {0}: {1}", ctx.id,
            wasStopped ? "Terminated" : "Finished"));
}
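processArticles is written as a plain thread entry point: each worker shares one queue, one statistics dictionary, and one cancellation token through its context object. A minimal launch sketch follows, assuming ArticleParserContext simply carries the fields the worker dereferences (id, articles, globalStats, shouldStop, reportAction); the real type may hold more state.

// Minimal sketch of a context and worker start-up (assumed, not the
// author's original code).
public class ArticleParserContext
{
    public int id;
    public Queue<ArticleHeader> articles;                 // shared work queue
    public Dictionary<string, WordStatsInfo> globalStats; // shared statistics
    public CancellationToken shouldStop;
    public Action<MessageKind, string> reportAction;
}

// Launching the pool: every context references the SAME queue, stats
// dictionary, and report delegate, which is what makes the lock()
// statements in the worker effective across threads.
var cts = new CancellationTokenSource();
var queue = new Queue<ArticleHeader>(headers); // headers: the parsed ArticleHeader list
var stats = new Dictionary<string, WordStatsInfo>();
Action<MessageKind, string> report =
    (kind, msg) => Console.WriteLine("{0}: {1}", kind, msg);

var workers = Enumerable.Range(0, Environment.ProcessorCount)
    .Select(i => Task.Run(() => processArticles(new ArticleParserContext
    {
        id = i,
        articles = queue,
        globalStats = stats,
        shouldStop = cts.Token,
        reportAction = report // single shared delegate instance
    })))
    .ToArray();

Task.WaitAll(workers);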
private static Dictionary<string, int> processArticle(ArticleHeader article,
    Action<MessageKind, string> doReport)
{
    var result = new Dictionary<string, int>();

    // Ignore the failures
    try
    {
        // Download the article
        string pageData;
        using (var client = new WebClient())
        {
            pageData = client.DownloadString(article.url);
        }

        // Parse the HTML and extract the article's main content
        var html = new HtmlDocument();

        // Load the HTML string
        html.LoadHtml(pageData);

        var domRoot = html.DocumentNode;
        var content = domRoot.Descendants()
            .Where(n => n.GetAttributeValue("class", "").Contains(Constants.SIGNATURE_CONTENT))
            .Single();
        var textOnly = content.Descendants("p");

        // Combine the text from paragraphs
        StringBuilder sb = new StringBuilder();
        foreach (var node in textOnly)
        {
            sb.Append(node.InnerText.ToLowerInvariant());
            sb.Append(" ");
        }

        // Normalize the text
        var articleText = sb.ToString();
        articleText = HttpUtility.HtmlDecode(articleText.Replace("&nbsp;", " "));

        // Split the text into words
        string[] articleWords = articleText.Split(' ');

        // Update the local stats
        foreach (var word in articleWords)
        {
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }
            if (!result.ContainsKey(word))
            {
                result[word] = 1;
            }
            else
            {
                result[word] = result[word] + 1;
            }
        }
    }
    catch
    {
        return null;
    }

    return result;
}
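Splitting on a single space keeps punctuation glued to words ("frequency," and "frequency" count as different entries) and misses tabs and newlines. If that matters, a regex-based tokenizer is a drop-in variant for the Split call above (a sketch, not part of the original code):

using System.Text.RegularExpressions;

// Variant tokenizer: split on any run of non-letter characters instead
// of a single space, so punctuation no longer produces distinct "words".
string[] articleWords = Regex.Split(articleText, @"[^\p{L}]+");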
private static void mergeStatistics(ArticleParserContext ctx,
    Dictionary<string, int> localStats, ArticleHeader header)
{
    // First of all, lock the global stats to
    // avoid collisions caused by the workers
    // running in parallel
    lock (ctx.globalStats)
    {
        ctx.reportAction(MessageKind.Info,
            String.Format("Worker {0}: Merging stats...", ctx.id));

        // Run through the worker's local stats and merge them
        foreach (var wordInfo in localStats)
        {
            string word = wordInfo.Key;

            // Make sure the word exists
            if (!ctx.globalStats.ContainsKey(word))
            {
                ctx.globalStats[word] = new WordStatsInfo();
            }

            // Add the local counter to the global one
            ctx.globalStats[word].count = ctx.globalStats[word].count + wordInfo.Value;

            // Add the article's reference
            ctx.globalStats[word].articles.Add(header.url.ToString());
        }
    }

    ctx.reportAction(MessageKind.Info,
        String.Format("Worker {0}: Stats merged", ctx.id));
}
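WordStatsInfo is defined elsewhere; from its use in mergeStatistics it needs a counter and a collection of article references. Below is a plausible shape plus an example of reporting the results once the workers are done. Both are sketches: the real type may differ, and a HashSet is chosen here so each URL is stored at most once per word even if the same article were ever merged twice.

// Hypothetical shape inferred from mergeStatistics.
public class WordStatsInfo
{
    public int count;
    public HashSet<string> articles = new HashSet<string>(); // de-duplicates URLs
}

// Example: print the ten most frequent words after Task.WaitAll(workers).
foreach (var entry in stats.OrderByDescending(e => e.Value.count).Take(10))
{
    Console.WriteLine("{0,-20} {1,6}  in {2} article(s)",
        entry.Key, entry.Value.count, entry.Value.articles.Count);
}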