Exemple #1
0
        /// <summary>
        /// Parses dictionary content starting at <paramref name="href"/> and saves every word that was read.
        /// </summary>
        /// <remarks>
        /// Depending on <paramref name="href"/> one of three things is parsed:
        /// the whole "all pages" index (when it is null or ends with "/special:allpages"),
        /// a single index page (when it contains the URL-encoded "Allpages/" segment), or a single word.
        /// Words placed in <c>wordQueue</c> are saved in batches by a dedicated foreground thread, which is
        /// always stopped and joined before this method returns or throws.
        /// </remarks>
        /// <param name="href">Link to parse; null means "process the all-pages index".</param>
        /// <param name="parserToken">Optional token used to request that parsing stops early.</param>
        public void Parse(string href = null, CancellationToken? parserToken = null)
        {
            // How long the saver thread waits before polling an empty queue again
            const int saverIdleWaitMilliseconds = 50;

            var tokenSource = new CancellationTokenSource();
            var saverToken = tokenSource.Token;

            // A thread to save ready words to database
            // We need a foreground thread here (not a task), if not when the main thread finishes it will terminate the thread before saving data completely
            var thread = new Thread(() =>
            {
                while (true)
                {
                    var words = new List<Word>();
                    for (int i = 0; i < NumberOfConcurrentSavingWords; i++)
                    {
                        if (!wordQueue.TryDequeue(out Word word))
                        {
                            break;
                        }

                        words.Add(word);
                    }

                    // Save many words at the same time to improve performance
                    if (words.Count > 0)
                    {
                        SaveWords(words);
                        continue;
                    }

                    // The queue is empty: stop only once the parser has signalled that no more words will come,
                    // so everything queued before the signal is still saved
                    if (saverToken.IsCancellationRequested)
                    {
                        break;
                    }

                    // Wait instead of spinning; the wait ends early as soon as the stop signal is raised
                    saverToken.WaitHandle.WaitOne(saverIdleWaitMilliseconds);
                }
            });

            thread.Start();

            try
            {
                if (href == null || href.EndsWith("/special:allpages", StringComparison.OrdinalIgnoreCase))
                {
                    // If there is a saved file, the process will start over from the state in this file
                    // So if we want to process from the beginning, the saved file has to be deleted
                    resumeState = CancellationUtil.Restore();

                    var url = "http://tratu.soha.vn/dict/en_vn/special:allpages";
                    logger.Log(GetType(), Level.Info, $"Processing all pages at {url}", null);

                    var htmlWeb = new HtmlWeb();
                    var htmlDoc = htmlWeb.Load(url);

                    // SelectNodes yields null (not an empty collection) when nothing matches
                    var pageLinks = htmlDoc.DocumentNode.SelectNodes("//table[@class='allpageslist']/tr/td[1]/a[@href]");
                    if (pageLinks == null)
                    {
                        logger.Log(GetType(), Level.Error, $"No page links were found at {url}", null);
                    }
                    else
                    {
                        foreach (var link in pageLinks)
                        {
                            var pageUrl = link.Attributes["href"].Value;
                            if (resumeState != null && !isReachedCancellationPage)
                            {
                                // Reach the stopped page, turn the flag to on so that all the following pages will be processed
                                if (pageUrl == resumeState.PageUrl)
                                {
                                    isReachedCancellationPage = true;
                                }
                                else
                                {
                                    // Ignore pages before saved page
                                    logger.Log(GetType(), Level.Info, $"Ignored page {pageUrl}", null);
                                    continue;
                                }
                            }

                            // Parser is requested to stop
                            if (parserToken?.IsCancellationRequested ?? false)
                            {
                                // Just need to save the state if it's canceled at a later point
                                if (resumeState == null || isReachedCancellationPage)
                                {
                                    cancellationState = new CancellationState()
                                    {
                                        PageUrl = pageUrl
                                    };
                                    logger.Log(GetType(), Level.Info, $"Process is being cancelled at {cancellationState.PageUrl}", null);
                                }
                                break;
                            }

                            ParsePage(pageUrl, parserToken);

                            // Parser is requested to stop
                            if (parserToken?.IsCancellationRequested ?? false)
                            {
                                break;
                            }
                        }
                    }
                }
                else if (href.Contains("%C4%90%E1%BA%B7c_bi%E1%BB%87t:Allpages/", StringComparison.OrdinalIgnoreCase))
                {
                    ParsePage(href);
                }
                else
                {
                    ParseWord(href);
                }
            }
            finally
            {
                // Signal the saver thread that no more words will be queued; it drains what is left and exits.
                // Doing this in finally keeps a parsing failure from leaving the foreground thread running forever.
                tokenSource.Cancel();

                // Wait for the saver thread to finish saving
                thread.Join();

                tokenSource.Dispose();
            }

            // Only save the state if there is a cancellation
            if (cancellationState != null)
            {
                CancellationUtil.Save(cancellationState);
            }

            logger.Log(GetType(), Level.Info, $"{totalWordCount} words were read from {totalPageCount} pages, {successWordCount} words were registered successfuly to database", null);
        }
Exemple #2
0
 /// <summary>
 /// Serializes <paramref name="state"/> to JSON and writes it to the file named by
 /// <c>CancellationStateFile</c>, replacing any previous content of that file.
 /// </summary>
 /// <param name="state">The cancellation state to persist.</param>
 public static void Save(CancellationState state)
 {
     var serializedState = JsonSerializer.Serialize(state);

     File.WriteAllText(CancellationStateFile, serializedState);
 }
Exemple #3
0
        /// <summary>
        /// Reads the word links listed on one index page and queues every word that could be read.
        /// </summary>
        /// <remarks>
        /// Words are processed in blocks of <c>NumberOfConcurrentProcessingWords</c>: the words of a block are
        /// read concurrently and the method waits for the whole block before moving on.
        /// When a resume state with a word URL exists, blocks are skipped until the block that starts with
        /// that word is reached. When cancellation is requested, the first word of the block that was about
        /// to be processed is recorded in <c>cancellationState</c>.
        /// If the page cannot be loaded or contains no word links, the problem is logged and nothing is queued.
        /// </remarks>
        /// <param name="href">Link of the index page; converted to a full URL with <c>CreateFullUrl</c>.</param>
        /// <param name="parserToken">Optional token used to request that parsing stops early.</param>
        public void ParsePage(string href, CancellationToken? parserToken = null)
        {
            var url = CreateFullUrl(href);

            logger.Log(GetType(), Level.Info, $"Processing a page at {url}", null);

            // Try to load the page, and retry once if the first attempt returns nothing
            var htmlDoc = LoadPageWithTimeout(url, 1) ?? LoadPageWithTimeout(url, 1);
            if (htmlDoc == null)
            {
                logger.Log(GetType(), Level.Error, $"Loading page {url} timed out", null);

                // Nothing to parse; continuing would dereference the missing document
                return;
            }

            // SelectNodes yields null when nothing matches; materialize once so blocks can be addressed by offset
            var wordNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id='bodyContent']/table[2]//td/a")?.ToList();
            if (wordNodes == null || wordNodes.Count == 0)
            {
                logger.Log(GetType(), Level.Info, $"No words were found on page {url}", null);
                return;
            }

            for (int offset = 0; offset < wordNodes.Count; offset += NumberOfConcurrentProcessingWords)
            {
                var firstWordUrl = wordNodes[offset].Attributes["href"].Value;
                if (resumeState?.WordUrl != null && !isReachedCancellationWord)
                {
                    if (firstWordUrl == resumeState.WordUrl)
                    {
                        // Reached the block where the previous run stopped; process from here on
                        isReachedCancellationWord = true;
                    }
                    else
                    {
                        logger.Log(GetType(), Level.Info, $"Ignored a block of {NumberOfConcurrentProcessingWords} words starting at {firstWordUrl}", null);
                        continue;
                    }
                }

                // Parser is requested to stop
                if (parserToken?.IsCancellationRequested ?? false)
                {
                    // Only record a new state when this run has passed the point where the previous run stopped
                    if (resumeState?.WordUrl == null || isReachedCancellationWord)
                    {
                        cancellationState = new CancellationState()
                        {
                            PageUrl = href,
                            WordUrl = firstWordUrl
                        };
                        logger.Log(GetType(), Level.Info, $"Process is being cancelled at {cancellationState.PageUrl}, {cancellationState.WordUrl}", null);
                    }

                    break;
                }

                // Load a block of words at a time and wait for that to complete before moving to another block
                var parserTasks = new List<Task>();
                foreach (var wordNode in wordNodes.Skip(offset).Take(NumberOfConcurrentProcessingWords))
                {
                    parserTasks.Add(Task.Run(() =>
                    {
                        var word = ReadWord(wordNode.Attributes["href"].Value);
                        if (word != null)
                        {
                            wordQueue.Enqueue(word);
                        }
                    }));
                }
                Task.WaitAll(parserTasks.ToArray());
            }
        }