public void Parse(string href = null, CancellationToken? parserToken = null)
{
    var tokenSource = new CancellationTokenSource();

    // A thread that saves ready words to the database.
    // We need a foreground thread here (not a task); otherwise, when the main thread
    // finishes it would terminate this thread before the data is saved completely.
    var thread = new Thread(() =>
    {
        while (!tokenSource.Token.IsCancellationRequested)
        {
            var words = new List<Word>();
            for (int i = 0; i < NumberOfConcurrentSavingWords; i++)
            {
                if (!wordQueue.TryDequeue(out Word word))
                {
                    break;
                }
                words.Add(word);
            }

            // Save many words at the same time to improve performance
            if (words.Count > 0)
            {
                SaveWords(words);
            }
        }
    });
    thread.Start();

    if (href == null || href.EndsWith("/special:allpages", StringComparison.OrdinalIgnoreCase))
    {
        // If there is a saved file, the process resumes from the state in that file.
        // So to start over from the beginning, the saved file has to be deleted first.
        resumeState = CancellationUtil.Restore();

        var url = "http://tratu.soha.vn/dict/en_vn/special:allpages";
        logger.Log(GetType(), Level.Info, $"Processing all pages at {url}", null);

        var htmlWeb = new HtmlWeb();
        var htmlDoc = htmlWeb.Load(url);
        foreach (var link in htmlDoc.DocumentNode.SelectNodes("//table[@class='allpageslist']/tr/td[1]/a[@href]"))
        {
            var pageUrl = link.Attributes["href"].Value;
            if (resumeState != null && !isReachedCancellationPage)
            {
                // Reached the page we stopped at; turn the flag on so that all following pages are processed
                if (pageUrl == resumeState.PageUrl)
                {
                    isReachedCancellationPage = true;
                }
                else
                {
                    // Ignore pages before the saved page
                    logger.Log(GetType(), Level.Info, $"Ignored page {pageUrl}", null);
                    continue;
                }
            }

            // Parser is requested to stop
            if (parserToken?.IsCancellationRequested ?? false)
            {
                // Only save the state if the cancellation happens at or after the resume point
                if (resumeState == null || isReachedCancellationPage)
                {
                    cancellationState = new CancellationState() { PageUrl = pageUrl };
                    logger.Log(GetType(), Level.Info, $"Process is being cancelled at {cancellationState.PageUrl}", null);
                }
                break;
            }

            ParsePage(pageUrl, parserToken);

            // Parser is requested to stop
            if (parserToken?.IsCancellationRequested ?? false)
            {
                break;
            }
        }
    }
    else if (href.Contains("%C4%90%E1%BA%B7c_bi%E1%BB%87t:Allpages/", StringComparison.OrdinalIgnoreCase))
    {
        ParsePage(href);
    }
    else
    {
        ParseWord(href);
    }

    // Wait until every queued word has been picked up by the saving thread
    while (wordQueue.Count > 0) { }

    // Signal the saving thread to end
    tokenSource.Cancel();

    // Wait for the saving thread to end
    thread.Join();

    // Only save the state if there is a cancellation
    if (cancellationState != null)
    {
        CancellationUtil.Save(cancellationState);
    }

    logger.Log(GetType(), Level.Info, $"{totalWordCount} words were read from {totalPageCount} pages, {successWordCount} words were registered successfully to the database", null);
}
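For context, here is a minimal sketch of how Parse could be driven from a console host so that Ctrl+C triggers a graceful stop and lets the method persist its CancellationState. The class name SohaParser and the console wiring are assumptions for illustration; they are not part of the original code.

// A minimal sketch, assuming the class containing Parse() is named SohaParser (hypothetical name)
using System;
using System.Threading;

class Program
{
    static void Main()
    {
        var cts = new CancellationTokenSource();

        // Request a graceful stop on Ctrl+C so Parse() can save its CancellationState before returning
        Console.CancelKeyPress += (sender, e) =>
        {
            e.Cancel = true;   // keep the process alive until Parse() finishes cleaning up
            cts.Cancel();
        };

        var parser = new SohaParser();   // hypothetical class name
        parser.Parse(null, cts.Token);   // null href = crawl everything from special:allpages
    }
}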
public static void Save(CancellationState state)
{
    File.WriteAllText(CancellationStateFile, JsonSerializer.Serialize(state));
}
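Parse calls CancellationUtil.Restore() and reads PageUrl and WordUrl off the result, but neither the Restore method nor the CancellationState type is shown above. The sketch below is one plausible counterpart, inferred from how Parse and ParsePage use them; the file name and the exact property list are assumptions, not confirmed by the original.

// Assumed shape of CancellationState and a possible Restore() counterpart to Save()
using System.IO;
using System.Text.Json;

public class CancellationState
{
    public string PageUrl { get; set; }
    public string WordUrl { get; set; }
}

public static partial class CancellationUtil
{
    private const string CancellationStateFile = "cancellation-state.json"; // assumed file name

    public static CancellationState Restore()
    {
        // Returning null means "no saved state", which Parse() treats as "start from the beginning"
        if (!File.Exists(CancellationStateFile))
        {
            return null;
        }
        return JsonSerializer.Deserialize<CancellationState>(File.ReadAllText(CancellationStateFile));
    }
}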
public void ParsePage(string href, CancellationToken? parserToken = null)
{
    var url = CreateFullUrl(href);
    logger.Log(GetType(), Level.Info, $"Processing a page at {url}", null);

    // Retry once if the first load times out
    var htmlDoc = LoadPageWithTimeout(url, 1);
    if (htmlDoc == null)
    {
        htmlDoc = LoadPageWithTimeout(url, 1);
    }
    if (htmlDoc == null)
    {
        logger.Log(GetType(), Level.Error, $"Loading page {url} timed out", null);
        return;
    }

    IEnumerable<HtmlNode> wordNodes = htmlDoc.DocumentNode.SelectNodes("//div[@id='bodyContent']/table[2]//td/a");
    do
    {
        var firstWordUrl = wordNodes.First().Attributes["href"].Value;
        if (resumeState?.WordUrl != null && !isReachedCancellationWord)
        {
            if (firstWordUrl == resumeState.WordUrl)
            {
                isReachedCancellationWord = true;
            }
            else
            {
                logger.Log(GetType(), Level.Info, $"Ignored a block of {NumberOfConcurrentProcessingWords} words starting at {firstWordUrl}", null);
                wordNodes = wordNodes.Skip(NumberOfConcurrentProcessingWords);
                continue;
            }
        }

        // Parser is requested to stop
        if (parserToken?.IsCancellationRequested ?? false)
        {
            if (resumeState?.WordUrl == null || isReachedCancellationWord)
            {
                cancellationState = new CancellationState()
                {
                    PageUrl = href,
                    WordUrl = firstWordUrl
                };
                logger.Log(GetType(), Level.Info, $"Process is being cancelled at {cancellationState.PageUrl}, {cancellationState.WordUrl}", null);
            }
            break;
        }

        // Process one block of words at a time and wait for it to complete before moving to the next block
        var consecutiveNodes = wordNodes.Take(NumberOfConcurrentProcessingWords);
        var parserTasks = new List<Task>();
        foreach (var w in consecutiveNodes)
        {
            parserTasks.Add(Task.Run(() =>
            {
                var word = ReadWord(w.Attributes["href"].Value);
                if (word != null)
                {
                    wordQueue.Enqueue(word);
                }
            }));
        }
        Task.WaitAll(parserTasks.ToArray());

        wordNodes = wordNodes.Skip(NumberOfConcurrentProcessingWords);
    } while (wordNodes.Any());
}
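LoadPageWithTimeout is called above but its body is not shown. The sketch below is one way it could be implemented with HtmlAgilityPack's HtmlWeb, assuming the second argument is a timeout in minutes; both the unit and the implementation are guesses, not the original code.

// A sketch of LoadPageWithTimeout, assuming the second argument is a timeout in minutes.
// Intended to live in the same parser class; requires System, System.Threading.Tasks and HtmlAgilityPack.
private HtmlDocument LoadPageWithTimeout(string url, int timeoutInMinutes)
{
    // Run the blocking load on the thread pool and give up if it does not finish in time
    var loadTask = Task.Run(() => new HtmlWeb().Load(url));
    if (loadTask.Wait(TimeSpan.FromMinutes(timeoutInMinutes)))
    {
        return loadTask.Result;
    }
    return null;   // Callers treat null as a timeout and either retry or log an error
}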