Exemplo n.º 1
0
        /// <summary>
        /// Crawler entry point. Validates the command line, obtains a crawler for the
        /// requested site/category/mode and then loops forever: collect the article
        /// list, crawl the articles, wait one minute, and compare the stored article
        /// count with the previous poll.
        /// </summary>
        /// <param name="args">
        /// Exactly four values: [Game] [Target Site] [Category ID] [mode].
        /// Anything else (wrong count or unparsable values) prints the usage text and returns.
        /// </param>
        static void Main(string[] args)
        {
            // TeraCrawler.exe [Game] [Target Site] [Category ID] [mode]
            Games game;
            TargetSites targetSite;
            int categoryId;

            // TryParse (rather than Parse) so that a malformed argument produces the
            // usage text below instead of an unhandled exception.
            if (args.Length != 4
                || !Enum.TryParse<Games>(args[0], out game)
                || !Enum.TryParse<TargetSites>(args[1], out targetSite)
                || !int.TryParse(args[2], out categoryId))
            {
                Logger.Log("Invalid execution parameters.");
                Logger.Log("TeraCrawler.exe [Game] [Target Site] [Category ID] [mode]");
                Logger.Log("[Game] must be one of ({0})", string.Join(", ", Enum.GetValues(typeof (Games)).OfType<Games>()));
                Logger.Log("[Target Site] must be one of ({0})", string.Join(", ", Enum.GetValues(typeof (TargetSites)).OfType<TargetSites>()));
                Logger.Log("[mode] must be one of (normal, recovery)");

                return;
            }

            var mode = args[3];

            var articleCount = 0;
            var sameCountCounter = 0;
            var crawler = Crawler.Get(targetSite, categoryId, mode);
            while (true)
            {
                try
                {
                    // In "normal" mode the crawler is reset once ten polls have
                    // reported an unchanged article count.
                    // NOTE(review): the counter is not cleared when the count changes,
                    // so the ten polls need not be consecutive - confirm this is intended.
                    if (mode == "normal" && sameCountCounter == 10)
                    {
                        sameCountCounter = 0;
                        crawler.Reset();

                        Logger.Log("Reset crawler!");
                    }

                    crawler.CollectArticleList();
                    crawler.CrawlArticles();
                }
                catch (Exception ex)
                {
                    Logger.Log(ex);
                }

                Thread.Sleep(60 * 1000);

                // The count poll gets the same log-and-continue treatment as the crawl
                // step, so a failing query does not terminate the loop.
                try
                {
                    using (var context = new TeraArticleDataContext())
                    {
                        var count = context.Articles.Count();
                        if (articleCount == count) sameCountCounter++;
                        articleCount = count;
                    }
                }
                catch (Exception ex)
                {
                    Logger.Log(ex);
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Writes a time-stamped message to the console and stores it as a row in the
        /// Logs table. A null message only prints a warning to the console.
        /// </summary>
        /// <param name="logType">Log category; stored as its integer value.</param>
        /// <param name="message">Text to log; may be null.</param>
        private static void Log(LogType logType, string message)
        {
            if (message != null)
            {
                // One timestamp is shared by the console line and the database row.
                var now = DateTime.Now;
                Console.WriteLine(string.Format("[{0}] {1}", now.ToString("HH:mm:ss"), message));

                var entry = new Log
                {
                    TimeStamp = now,
                    LogType = (int)logType,
                    Message = message,
                };

                using (var db = new TeraArticleDataContext())
                {
                    db.Logs.InsertOnSubmit(entry);
                    db.SubmitChanges();
                }

                return;
            }

            Console.WriteLine("[Warning] Log message is null");
        }
Exemplo n.º 3
0
        /// <summary>
        /// Preprocessor entry point. Loops forever: picks the article with the lowest
        /// ArticleAutoId above the Preprocess checkpoint, strips its HTML, runs the
        /// text through mecab.exe, stores the resulting XML in the article's Keywords
        /// and advances the checkpoint. Sleeps one minute when no article is pending.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var contentFilePath = @"";
            while (true)
            {
                try
                {
                    using (var context = new TeraArticleDataContext())
                    {
                        #region initializing data
                        // Create the Preprocess checkpoint row on first run.
                        if (!context.CheckPoints.Any(e => e.AnalysisPhase == (int)AnalysisPhase.Preprocess))
                        {
                            context.CheckPoints.InsertOnSubmit(new CheckPoint
                            {
                                AnalysisPhase = (int)AnalysisPhase.Preprocess,
                                ProcessedArticleId = 0,
                            });
                            context.SubmitChanges();
                        }
                        #endregion

                        #region get an article to work
                        var processedInfo = context.CheckPoints.Where(e => e.AnalysisPhase == (int)AnalysisPhase.Preprocess).First();

                        // fetch the next article with a single query (null when nothing is pending)
                        var article = context.Articles
                            .Where(e => e.ArticleAutoId > processedInfo.ProcessedArticleId)
                            .OrderBy(e => e.ArticleAutoId)
                            .FirstOrDefault();

                        // no article - Sleep thread
                        if (article == null)
                        {
                            Logger.Log("All documents are preprocessed!");
                            Thread.Sleep(60 * 1000);
                            continue;
                        }

                        // nothing to analyze: just move the checkpoint past this article
                        if (article.ContentHtml == null)
                        {
                            processedInfo.ProcessedArticleId = article.ArticleAutoId;
                            context.SubmitChanges();
                            continue;
                        }

                        #endregion

                        #region clean up text and make XML Document

                        var document = new HtmlDocument();
                        document.LoadHtml(article.ContentHtml.Replace("<br>", "\n"));
                        var htmlCleanText = WebUtility.HtmlDecode(document.DocumentNode.InnerText).Trim();
                        htmlCleanText = ReplaceHexadecimalSymbols(htmlCleanText);

                        contentFilePath = string.Format(@"C:\mecab\{0}.in", article.ArticleAutoId);
                        File.WriteAllText(contentFilePath, htmlCleanText);

                        var analyzerOutput = RunMecab(contentFilePath);
                        var xDoc = BuildKeywordDocument(htmlCleanText, analyzerOutput);
                        File.Delete(contentFilePath);

                        #endregion

                        #region Update to database

                        article.Keywords = xDoc.ToString();
                        processedInfo.ProcessedArticleId = article.ArticleAutoId;

                        context.SubmitChanges();

                        #endregion
                    }
                }
                catch (Exception ex)
                {
                    // NOTE(review): the checkpoint is not advanced on failure, so the same
                    // article is retried immediately on the next iteration - confirm intended.
                    Logger.Log(ex);
                    if (File.Exists(contentFilePath)) File.Delete(contentFilePath);
                }

            } // end - while(true)
        }

        /// <summary>
        /// Runs mecab.exe on the given input file and returns everything the process
        /// wrote to standard output (decoded as UTF-8).
        /// </summary>
        /// <param name="inputFilePath">Path of the text file passed to mecab.exe.</param>
        /// <returns>The complete standard output of the process.</returns>
        private static string RunMecab(string inputFilePath)
        {
            // Dispose the Process so its OS handle is released on every iteration.
            using (var p = new Process())
            {
                p.StartInfo.RedirectStandardError = true;
                p.StartInfo.RedirectStandardOutput = true;
                p.StartInfo.UseShellExecute = false;
                p.StartInfo.CreateNoWindow = true;
                p.StartInfo.FileName = @"C:\mecab\mecab.exe";
                p.StartInfo.Arguments = @"-r C:\mecab\dic\dicrc -d C:\mecab\dic " + inputFilePath;
                p.StartInfo.RedirectStandardInput = true;
                p.StartInfo.StandardOutputEncoding = Encoding.UTF8;
                p.Start();

                // Stderr is redirected, so it must be drained; otherwise the child can
                // block on a full pipe and ReadToEnd below would never return.
                // The error text itself is discarded.
                p.BeginErrorReadLine();

                var output = p.StandardOutput.ReadToEnd();
                p.WaitForExit();
                return output;
            }
        }

        /// <summary>
        /// Builds the XML document stored in an article's Keywords: a Document root
        /// holding one HtmlCleanDocument element with the cleaned text, followed by one
        /// Sentence element per "EOS"-delimited chunk of analyzer output that yields
        /// at least one NN/VV/VA morpheme.
        /// </summary>
        /// <param name="htmlCleanText">Cleaned article text.</param>
        /// <param name="analyzerOutput">Raw standard output of the analyzer.</param>
        /// <returns>The assembled document.</returns>
        private static XDocument BuildKeywordDocument(string htmlCleanText, string analyzerOutput)
        {
            var xDoc = new XDocument();
            xDoc.Add(new XElement("Document"));
            xDoc.Root.Add(new XElement("HtmlCleanDocument", htmlCleanText));

            foreach (var sentence in analyzerOutput.Split(new[] { "EOS" }, StringSplitOptions.RemoveEmptyEntries))
            {
                // Each line is split on tab and comma; only lines with more than
                // eight fields are considered.
                var morphemeList = sentence
                    .Split('\n')
                    .Select(e => e.Split('\t', ','))
                    .Where(e => e.Length > 8)
                    .Select(e => new
                    {
                        Token = e[0].Trim(),
                        Tag = e[1].Trim(),
                        CombinedResult = e[7].Trim(),
                    })
                    .Where(e => ContainsKeywordTag(e.Tag))
                    .ToList();

                if (morphemeList.Count == 0) continue;

                var preprocessedMorphemeList = new List<string>();
                foreach (var morpheme in morphemeList)
                {
                    if (morpheme.CombinedResult == "*")
                    {
                        // "*" in field 7: emit the token with its own tag
                        preprocessedMorphemeList.Add(morpheme.Token + "/" + morpheme.Tag);
                    }
                    else
                    {
                        // otherwise field 7 is a '+'-joined list; keep the matching parts
                        preprocessedMorphemeList.AddRange(morpheme.CombinedResult.Split('+').Where(ContainsKeywordTag));
                    }
                }
                xDoc.Root.Add(new XElement("Sentence", string.Join(",", preprocessedMorphemeList)));
            }

            return xDoc;
        }

        /// <summary>Returns true when the text contains "NN", "VV" or "VA".</summary>
        private static bool ContainsKeywordTag(string text)
        {
            return text.Contains("NN") || text.Contains("VV") || text.Contains("VA");
        }