/// <summary>
/// Scratch entry point: opens and closes a database connection, prints the result of
/// normalizing an empty query, runs <c>Test()</c> and returns. The search-engine
/// section after the <c>return</c> is never executed.
/// </summary>
static void Main(string[] args)
{
    // Connectivity check against the SloWebCorpus database.
    DatabaseConnection connection = new DatabaseConnection();
    connection.SetConnectionString(DatabaseType.SqlServer2005);
    connection.Username = "******";
    connection.Password = "******";
    connection.Server = "\\SQLEXPRESS";
    connection.Database = "SloWebCorpus";
    connection.Connect();
    // ...
    connection.Disconnect();

    // Query normalization check (the richer sample query is kept for reference).
    //Console.WriteLine(WebUtils.NormalizeQuery("ata +fabo + - - -\"maMma mia\" -dec\"\" -\"\""));
    Console.WriteLine("\"{0}\"", WebUtils.NormalizeQuery(""));

    Test();
    return;

    // Unreachable: Yahoo search sample, kept behind the early return above.
    YahooSearchEngine engine = new YahooSearchEngine("internet");
    engine.Language = Language.French;
    engine.ResultSetMaxSize = 300;
    engine.Search();

    int position = 0;
    foreach (SearchEngineResultItem hit in engine.ResultSet)
    {
        position++;
        Console.WriteLine("{0}. {1}\r\n{2}\r\n", position, hit.Title, hit.Snippet);
    }
}
/// <summary>
/// Creates the component, opens its own database connection from the given
/// connection string and selects "TextBlock" as the block selector.
/// </summary>
/// <param name="dbConnectionString">Connection string assigned to the component's database connection before connecting.</param>
public UrlTreeBoilerplateRemoverComponent(string dbConnectionString)
    : base(typeof(UrlTreeBoilerplateRemoverComponent))
{
    // Configure the connection first, then keep it in the field and open it.
    DatabaseConnection connection = new DatabaseConnection();
    connection.ConnectionString = dbConnectionString;
    mDbConnection = connection;
    mDbConnection.Connect();

    mBlockSelector = "TextBlock";
}
/// <summary>
/// Entry point of the Dacq data acquisition service. Reads settings from the
/// application configuration, optionally starts an HTTP status server, builds
/// NUM_WRITERS document-processing pipelines behind a load balancer, starts the
/// configured data readers (offline stream and/or RSS sites), then blocks until
/// Ctrl-C is received and shuts all components down.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // global Web request settings
    WebUtils.DefaultTimeout = 10000;
    WebUtils.NumRetries = 3;
    WebUtils.WaitBetweenRetries = 5000;
    WebUtils.ReadWriteTimeout = 30000;
    ServicePointManager.DefaultConnectionLimit = 8;
    // assigned right before the main wait loop; reported by the HTTP server
    DateTime startTime = DateTime.MinValue;
    // root logger: always console, additionally a file (append mode) if configured
    Logger rootLogger = Logger.GetRootLogger();
    rootLogger.LocalLevel = Logger.Level.Debug;
    string LOG_FILE_NAME = ConfigurationManager.AppSettings["logFileName"];
    rootLogger.LocalOutputType = Logger.OutputType.Console;
    if (LOG_FILE_NAME != null)
    {
        rootLogger.LocalOutputWriter = new StreamWriter(LOG_FILE_NAME, /*append=*/true);
        rootLogger.LocalOutputType |= Logger.OutputType.Writer;
    }
    // configuration (any of the AppSettings values may be null if not configured)
    string SOURCES_FILE_NAME = ConfigurationManager.AppSettings["dataSourcesFileName"];
    string WEB_SITE_ID = Utils.GetConfigValue("webSiteId", "dacq");
    string DB_CONNECTION_STRING = ConfigurationManager.AppSettings["dbConnectionString"];
    string SQL_DB_CONNECTION_STRING = ConfigurationManager.AppSettings["SqlDbConnectionString"];
    string DB_CONNECTION_STRING_DUMP = ConfigurationManager.AppSettings["dbConnectionStringDump"];
    string CLIENT_IP = ConfigurationManager.AppSettings["clientIp"];
    string XML_DATA_ROOT = ConfigurationManager.AppSettings["xmlDataRoot"];
    string XML_DATA_ROOT_DUMP = ConfigurationManager.AppSettings["xmlDataRootDump"];
    string HTML_DATA_ROOT = ConfigurationManager.AppSettings["htmlDataRoot"];
    string HTML_DATA_ROOT_DUMP = ConfigurationManager.AppSettings["htmlDataRootDump"];
    // "<root>\New" variants of the data roots (null if the root is not configured)
    string XML_DATA_ROOT_NEW = XML_DATA_ROOT == null ? null : (XML_DATA_ROOT.TrimEnd('\\') + "\\" + "New");
    string HTML_DATA_ROOT_NEW = HTML_DATA_ROOT == null ? null : (HTML_DATA_ROOT.TrimEnd('\\') + "\\" + "New");
    string XML_DATA_ROOT_DUMP_NEW = XML_DATA_ROOT_DUMP == null ? null : (XML_DATA_ROOT_DUMP.TrimEnd('\\') + "\\" + "New");
    string HTML_DATA_ROOT_DUMP_NEW = HTML_DATA_ROOT_DUMP == null ? null : (HTML_DATA_ROOT_DUMP.TrimEnd('\\') + "\\" + "New");
    string DB_CONNECTION_STRING_NEW = ConfigurationManager.AppSettings["SqlDbConnectionStringNew"];
    string tmp = ConfigurationManager.AppSettings["enableZeroMQ"];
    bool ENABLE_ZEROMQ = tmp != null && new List<string>(new string[] { "true", "1", "yes", "on" }).Contains(tmp.ToLower());
    const int NUM_WRITERS = 8;
    const int SLEEP_BETWEEN_POLLS = 15 * 60000; // 15 minutes
    ArrayList<StreamDataProducerPoll> dataReaders = new ArrayList<StreamDataProducerPoll>();
    ArrayList<StreamDataConsumer> dataConsumers = new ArrayList<StreamDataConsumer>();
    Dictionary<IWorkflowComponent, Guid> components = new Dictionary<IWorkflowComponent, Guid>();
    // init logging
    Logger logger = Logger.GetLogger("Latino.Workflows.Dacq");
    // start HTTP server
    // (exit and httpServerRunning are shared with the HTTP thread and the Ctrl-C handler)
    bool exit = false;
    bool httpServerRunning = false;
    if (CLIENT_IP != null) // the value itself is not used below; it only enables the server
    {
        new Thread(new ThreadStart(delegate()
        {
            HttpListener listener = new HttpListener();
            listener.AuthenticationSchemes = AuthenticationSchemes.Anonymous;
            listener.Prefixes.Add(string.Format("http://localhost/{0}/", WEB_SITE_ID));
            listener.Prefixes.Add(string.Format("http://first.ijs.si/{0}/", WEB_SITE_ID));
            listener.Start();
            logger.Info("Main.HttpServer", "HTTP server started.");
            httpServerRunning = true;
            DateTime prevRequestTime = DateTime.MinValue;
            while (!exit)
            {
                try
                {
                    // accept asynchronously and poll, so that the exit flag is noticed
                    // while no request is pending
                    HttpListenerContext ctx = null;
                    listener.BeginGetContext(new AsyncCallback(delegate(IAsyncResult ar)
                    {
                        try { ctx = listener.EndGetContext(ar); }
                        catch { }
                    }), /*state=*/null);
                    while (!exit && ctx == null) { Thread.Sleep(500); }
                    if (!exit)
                    {
                        // process requests one by one
                        bool responseSent = false;
                        try
                        {
                            ctx.Response.AppendHeader("Content-Type", "application/xml");
                            XmlWriterSettings settings = new XmlWriterSettings();
                            settings.Encoding = Encoding.UTF8;
                            settings.CheckCharacters = false;
                            settings.Indent = true;
                            XmlWriter w = XmlTextWriter.Create(ctx.Response.OutputStream, settings);
                            w.WriteStartElement("DacqResponse");
                            w.WriteElementString("DacqStartTime", startTime.ToString(TIME_FORMAT));
                            if (prevRequestTime == DateTime.MinValue) { prevRequestTime = startTime; }
                            DateTime thisRequestTime = DateTime.Now;
                            string command = GetHttpRequestCommand(ctx.Request.Url.ToString());
                            if (command == "components")
                            {
                                w.WriteElementString("PreviousRequestTime", prevRequestTime.ToString(TIME_FORMAT));
                                w.WriteElementString("ThisRequestTime", thisRequestTime.ToString(TIME_FORMAT));
                            }
                            w.WriteElementString("Request", ctx.Request.Url.ToString());
                            w.WriteElementString("Command", command);
                            w.WriteStartElement("ResponseBody");
                            if (command == "help")
                            {
                                WriteSupportedCommands(w);
                            }
                            else if (command == "components")
                            {
                                WriteComponentInfo(w, components, thisRequestTime, prevRequestTime);
                                prevRequestTime = thisRequestTime;
                            }
                            else if (command == "sources")
                            {
                                WriteRssInfo(w, components);
                            }
                            w.WriteEndElement(); // ResponseBody
                            w.WriteEndElement(); // DacqResponse
                            w.Close();
                            ctx.Response.Close();
                            responseSent = true;
                        }
                        finally
                        {
                            // if anything above failed, drop the connection instead of
                            // leaving the response open (the failure is logged below)
                            if (!responseSent) { ctx.Response.Abort(); }
                        }
                    }
                }
                catch (Exception e)
                {
                    logger.Warn("Main.HttpServer", e);
                }
            }
            listener.Stop();
            logger.Info("Main.HttpServer", "HTTP server stopped.");
            httpServerRunning = false;
        })).Start();
    }
    // Ctrl-C: keep the process alive (e.Cancel), request shutdown and log what is still running
    Console.CancelKeyPress += delegate(object sender, ConsoleCancelEventArgs e)
    {
        logger.Info("Main", "*** Ctrl-C command received. ***");
        e.Cancel = true;
        exit = true;
        string componentsStr = "";
        foreach (StreamDataProducerPoll c in dataReaders)
        {
            if (c.IsRunning) { componentsStr += "\r\n" + c.GetType() + " : " + c.Name; }
        }
        foreach (StreamDataConsumer dataConsumer in dataConsumers)
        {
            if (dataConsumer.IsRunning) { componentsStr += "\r\n" + dataConsumer.GetType() + " : " + dataConsumer.Load; }
        }
        logger.Info("Main", "Active components:" + componentsStr);
    };
    logger.Info("Main", "Starting Dacq ...");
    // initialize database writers
    PassOnComponent lb = new PassOnComponent(); // data load balancer
    lb.DispatchPolicy = DispatchPolicy.BalanceLoadMax;
    dataConsumers.Add(lb);
    ZeroMqEmitterComponent zmq = null;
    if (ENABLE_ZEROMQ)
    {
        zmq = new ZeroMqEmitterComponent();
        dataConsumers.Add(zmq);
    }
    DatabaseConnection dbConnection = new DatabaseConnection();
    if (DB_CONNECTION_STRING != null)
    {
        dbConnection.ConnectionString = DB_CONNECTION_STRING;
        dbConnection.Connect();
        try
        {
            UrlTreeBoilerplateRemoverComponent.InitializeHistory(dbConnection);
        }
        finally
        {
            // release the connection even if loading the history fails
            dbConnection.Disconnect();
        }
        RssFeedComponent.DatabaseConnectionString = SQL_DB_CONNECTION_STRING;
    }
    // one complete processing pipeline per writer; all pipelines are fed by the load balancer
    for (int i = 0; i < NUM_WRITERS; i++)
    {
        DocumentCorpusWriterComponent dcw = new DocumentCorpusWriterComponent(DB_CONNECTION_STRING_DUMP, /*xmlDataRoot=*/null);
        DocumentWriterComponent dwc = new DocumentWriterComponent(/*connectionString=*/null, XML_DATA_ROOT_DUMP_NEW, HTML_DATA_ROOT_DUMP_NEW);
        dcw.IsDumpWriter = true;
        // NOTE(review): DB_CONNECTION_STRING may be null here (see the null check above) - confirm the component handles that
        UrlTreeBoilerplateRemoverComponent bpr = new UrlTreeBoilerplateRemoverComponent(DB_CONNECTION_STRING);
        DocumentCorpusWriterComponent cw = new DocumentCorpusWriterComponent(DB_CONNECTION_STRING, XML_DATA_ROOT);
        DocumentWriterComponent dw = new DocumentWriterComponent(DB_CONNECTION_STRING_NEW, XML_DATA_ROOT_NEW, HTML_DATA_ROOT_NEW);
        HtmlTokenizerComponent htc = new HtmlTokenizerComponent();
        SentenceSplitterComponent ssc = new SentenceSplitterComponent();
        EnglishTokenizerComponent tok = new EnglishTokenizerComponent();
        EnglishLemmatizerComponent lem = new EnglishLemmatizerComponent(EnglishLemmatizerComponent.Type.Both);
        EnglishPosTaggerComponent pt = new EnglishPosTaggerComponent();
        LanguageDetectorComponent ld = new LanguageDetectorComponent();
        DocumentFilterComponent df = new DocumentFilterComponent();
        // document filter: returns false (reject) on the first failed check, true otherwise
        df.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler(delegate(Document document, Logger dfLogger)
        {
            string docId = document.Features.GetFeatureValue("guid");
            // remove items without link in RSS document
            if (document.Features.GetFeatureValue("contentType") != "Text")
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: contentType not Text (id={0}).", docId);
                return false;
            }
            // remove items with blacklisted URL
            if (document.Features.GetFeatureValue("blacklisted") == "True")
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: responseUrl blacklisted (id={0}).", docId);
                return false;
            }
            // remove items with not enough content
            if (Convert.ToInt32(document.Features.GetFeatureValue("bprContentCharCount")) < 100)
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: bprContentCharCount < 100 (id={0}).", docId);
                return false;
            }
            // remove non-English items
            if (document.Features.GetFeatureValue("detectedCharRange") != "Basic Latin")
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: detectedCharRange not Basic Latin (id={0}).", docId);
                return false;
            }
            if (document.Features.GetFeatureValue("detectedLanguage") != "English")
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: detectedLanguage not English (id={0}).", docId);
                return false;
            }
            // remove exact duplicates
            if (document.Features.GetFeatureValue("unseenContent") == "No")
            {
                dfLogger.Info("OnFilterDocument", "Document rejected: no unseen content (id={0}).", docId);
                return false;
            }
            return true;
        });
        ld.BlockSelector = "TextBlock/Content"; // due to problems with itar-tass.com
        // the two dump writers are attached to the filter's dump output
        df.SubscribeDumpConsumer(dcw);
        df.SubscribeDumpConsumer(dwc);
        // pipeline: lb -> htc -> bpr -> ld -> df -> ssc -> tok -> lem -> pt -> (cw, dw[, zmq])
        lb.Subscribe(htc);
        htc.Subscribe(bpr);
        bpr.Subscribe(ld);
        ld.Subscribe(df);
        df.Subscribe(ssc);
        ssc.Subscribe(tok);
        tok.Subscribe(lem);
        lem.Subscribe(pt);
        pt.Subscribe(cw);
        pt.Subscribe(dw);
        if (ENABLE_ZEROMQ) { pt.Subscribe(zmq); }
        dataConsumers.AddRange(new StreamDataConsumer[] { dcw, dwc, df, ld, htc, ssc, tok, pt, cw, dw, lem, bpr });
    }
    // initialize stream simulator
    string offlineSource = ConfigurationManager.AppSettings["offlineSource"];
    if (offlineSource != null)
    {
        DocumentStreamReaderComponent dsr = new DocumentStreamReaderComponent(offlineSource);
        dsr.Name = offlineSource;
        dataReaders.Add(dsr);
        dsr.Subscribe(lb);
    }
    // initialize RSS feed components
    // (sources file: blank lines and lines starting with '#' are skipped; a "site:<id>" line
    // starts a new RSS component; every other line is added as a source to the most recent one)
    RssFeedComponent rssComp = null;
    Set<string> sites = new Set<string>();
    if (SOURCES_FILE_NAME != null)
    {
        string[] sources = File.ReadAllLines(SOURCES_FILE_NAME);
        foreach (string _url in sources)
        {
            string url = _url.Trim();
            if (url != "" && !url.StartsWith("#"))
            {
                Match m;
                if ((m = Regex.Match(url, @"^site\s*:(?<siteId>.*)$", RegexOptions.IgnoreCase)).Success)
                {
                    string siteId = m.Result("${siteId}").Trim().ToLower();
                    if (sites.Contains(siteId)) { throw new Exception(string.Format("Duplicated site identifier ({0}).", siteId)); }
                    sites.Add(siteId);
                    rssComp = new RssFeedComponent(siteId);
                    rssComp.MaxDocsPerCorpus = Convert.ToInt32(Utils.GetConfigValue("MaxDocsPerCorpus", "50"));
                    rssComp.RandomDelayAtStart = new ArrayList<string>("yes,on,true,1,y".Split(','))
                        .Contains(Utils.GetConfigValue("RandomDelayAtStart", "true").ToLower());
                    rssComp.Name = siteId;
                    rssComp.TimeBetweenPolls = SLEEP_BETWEEN_POLLS;
                    rssComp.IncludeRssXml = true;
                    if (SQL_DB_CONNECTION_STRING != null)
                    {
                        rssComp.Initialize(SQL_DB_CONNECTION_STRING);
                    }
                    rssComp.IncludeRawData = true;
                    rssComp.Subscribe(lb);
                    dataReaders.Add(rssComp);
                }
                else if (rssComp != null)
                {
                    // source lines that appear before the first "site:" line are ignored
                    rssComp.AddSource(url);
                }
            }
        }
    }
    // start the readers and register every component under a fresh GUID for the HTTP server
    foreach (StreamDataProducerPoll c in dataReaders)
    {
        c.Start();
    }
    foreach (IWorkflowComponent obj in dataReaders) { components.Add(obj, Guid.NewGuid()); }
    foreach (IWorkflowComponent obj in dataConsumers) { components.Add(obj, Guid.NewGuid()); }
    startTime = DateTime.Now;
    // block until the Ctrl-C handler sets the exit flag
    while (!exit) { Thread.Sleep(500); }
    // shut down gracefully
    logger.Info("Main", "Please wait while shutting down ...");
    // wait for HTTP server shutdown
    while (httpServerRunning) { Thread.Sleep(500); }
    // stop RSS components
    foreach (StreamDataProducerPoll c in dataReaders)
    {
        c.Stop();
    }
    foreach (StreamDataProducerPoll c in dataReaders)
    {
        while (c.IsRunning) { Thread.Sleep(500); }
    }
    // wait for all data consumers to finish
    foreach (StreamDataConsumer dataConsumer in dataConsumers)
    {
        if (dataConsumer.IsRunning)
        {
            while (!dataConsumer.IsSuspended) { Thread.Sleep(500); }
        }
        dataConsumer.Dispose();
    }
    logger.Info("Main", "Dacq successfully stopped.");
}
/// <summary>
/// Clears the static URL and text-block history and reloads it from the database.
/// First selects a set of domains (the union of the most recently seen and the most
/// frequent ones), then, per domain, replays the most recent documents from oldest
/// to newest into the domain's URL cache and URL tree.
/// </summary>
/// <param name="dbConnection">Connection used to run the history queries; it is neither opened nor closed here.</param>
public static void InitializeHistory(DatabaseConnection dbConnection)
{
    // upper bound used for each of the two domain sub-selects below
    const int maxDomains = 3000; /*make this configurable!!*/
    Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));
    logger.Info("InitializeHistory", "Loading history ...");
    mUrlInfo.Clear();
    mTextBlockInfo.Clear();
    DataTable domainsTbl = dbConnection.ExecuteQuery(string.Format(@" SELECT DISTINCT domain FROM (SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY MAX(time) DESC) x UNION SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY COUNT(*) DESC) y) z", maxDomains));
    int domainCount = 0;
    foreach (DataRow row in domainsTbl.Rows)
    {
        string domainName = (string)row["domain"];
        // newest documents of this domain, at most mMaxQueueSize rows, newest first
        DataTable urlInfoTbl = dbConnection.ExecuteQuery(string.Format(@" SELECT TOP {0} d.id, d.corpusId, d.time, d.responseUrl, d.urlKey, d.rev, d.domain, (SELECT TOP 1 dd.rev from Documents dd WHERE dd.urlKey = d.urlKey ORDER BY dd.time DESC, dd.rev DESC) AS maxRev, tb.hashCodes FROM Documents d INNER JOIN TextBlocks tb ON d.corpusId = tb.corpusId AND d.id = tb.docId WHERE d.domain = ? ORDER BY d.time DESC", mMaxQueueSize), domainName);
        if (urlInfoTbl.Rows.Count == 0) { continue; }
        Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
        // cut-off: mHistoryAgeDays before the newest document of this domain (row 0)
        DateTime then = DateTime.Parse((string)urlInfoTbl.Rows[0]["time"]) - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
        domainCount++;
        Console.WriteLine("* " + domainName + string.Format(" ({0}/{1})", domainCount, domainsTbl.Rows.Count));
        Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
        // rows are sorted newest first, so walk backwards to replay them oldest first
        for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
        {
            DataRow urlRow = urlInfoTbl.Rows[j];
            int rev = (int)urlRow["rev"];
            int maxRev = (int)urlRow["maxRev"];
            string urlKey = (string)urlRow["urlKey"];
            DateTime time = DateTime.Parse((string)urlRow["time"]);
            if (time >= then)
            {
                // URL cache
                if (rev == 1)
                {
                    if (urlInfo.First.ContainsKey(urlKey)) { Remove(urlKey, urlInfo); }
                    //Console.WriteLine(maxRev);
                    urlInfo.First.Add(urlKey, new Ref<int>(maxRev));
                    urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
                }
                else
                {
                    urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/null, time)); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                }
                // URL tree
                string hashCodesBase64 = (string)urlRow["hashCodes"];
                string responseUrl = (string)urlRow["responseUrl"];
                // hash codes are stored Base64-encoded and read back through BinarySerializer
                byte[] buffer = Convert.FromBase64String(hashCodesBase64);
                BinarySerializer memSer = new BinarySerializer(new MemoryStream(buffer));
                ArrayList<ulong> hashCodes = new ArrayList<ulong>(memSer);
                bool fullPath = urlKey.Contains("?");
                TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/rev == 1);
                textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, /*incDocCount=*/rev == 1);
                textBlockInfo.Second.Enqueue(entry);
            }
        }
    }
    // domainCount only counts domains that returned at least one row
    logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
}