/// <summary>
/// Builds the word trie from the file named by the "WordsFile" app setting, empties
/// [dbo].[tbSplit], splits every chunk of tbNewDomains via <c>SplitNames</c>, and finally
/// runs the [dbo].[CreateDomainsTableKeys] stored procedure. Progress is written to the console.
/// </summary>
private static void EngWords()
{
    DateTime startedAt = DateTime.Now;

    // Both console messages below report the time elapsed since the very start of this method.
    Func<string> elapsedSeconds =
        () => Math.Round((decimal)(DateTime.Now - startedAt).TotalSeconds).ToString();

    string wordsPath = ConfigurationManager.AppSettings["WordsFile"];
    trie = new Trie(File.ReadAllLines(wordsPath));
    result = new Words(new List<string>());
    result.Tree = trie;

    // Chunk size used to derive the number of chunks; SplitNames uses the same value for its Id ranges.
    const int chunkSize = 10000;

    using (var context = new DomainsEntities())
    {
        // 0 = no command timeout; the statements below can run for a long time.
        context.Database.CommandTimeout = 0;
        context.Database.ExecuteSqlCommand("TRUNCATE TABLE [dbo].[tbSplit]");

        // NOTE(review): the chunk count is derived from the row count, which presumably
        // assumes Ids are dense and start near zero - confirm against the table definition.
        int lastChunk = context.tbNewDomains.Count() / chunkSize;

        // Chunks 0..lastChunk are processed; a blocking gen-0 collection is forced
        // between consecutive chunks (not after the final one).
        for (int chunk = 0; ; chunk++)
        {
            if (!SplitNames(chunk) || chunk >= lastChunk)
            {
                break;
            }

            GC.Collect(0, GCCollectionMode.Forced, true);
        }

        Console.WriteLine("Names split is executed in {0} sec", elapsedSeconds());

        // Prepare the keyword search data.
        context.Database.ExecuteSqlCommand("EXEC [dbo].[CreateDomainsTableKeys]");
        Console.WriteLine("Keywords searches are filled {0} sec", elapsedSeconds());
    }
}
/// <summary>
/// Executes the [dbo].[CreateDomainsTable] stored procedure and prints the record count
/// it returns together with the elapsed time in whole seconds.
/// </summary>
private static void CreateMasterTable()
{
    using (var context = new DomainsEntities())
    {
        DateTime startedAt = DateTime.Now;

        // 0 = no command timeout, so the procedure is not cut off while it runs.
        context.Database.CommandTimeout = 0;

        // The procedure is expected to return exactly one integer row (Single() throws otherwise).
        int recordCount = context.Database
            .SqlQuery<Int32>("EXEC [dbo].[CreateDomainsTable]")
            .Single();

        decimal seconds = Math.Round((decimal)(DateTime.Now - startedAt).TotalSeconds);
        Console.WriteLine(
            "New table ({0} records) is normalized in {1} sec",
            recordCount.ToString(),
            seconds.ToString());
    }
}
/// <summary>
/// Splits the names of one chunk of tbNewDomains rows into words and bulk-inserts
/// the resulting tbSplit rows.
/// </summary>
/// <param name="chunkNum">
/// Zero-based chunk index; the chunk covers rows with
/// chunkNum * 10000 &lt;= Id &lt; (chunkNum + 1) * 10000.
/// </param>
/// <returns>
/// Always <c>true</c>. A failed insert is reported on the console instead of being
/// thrown, so the caller's loop carries on with the next chunk.
/// </returns>
private static bool SplitNames(int chunkNum)
{
    const int chunkSize = 10000;

    using (var db = new DomainsEntities())
    {
        // Change detection and validation are switched off for this context.
        db.Configuration.AutoDetectChangesEnabled = false;
        db.Configuration.ValidateOnSaveEnabled = false;

        var t0 = DateTime.Now;

        // Compute the Id window once instead of inside the query expression.
        var firstId = chunkNum * chunkSize;
        var endId = (chunkNum + 1) * chunkSize;
        var list = db.tbNewDomains.Where(i => i.Id >= firstId && i.Id < endId);

        var dlist = new List<tbSplit>();
        foreach (var d in list)
        {
            // FindAndSplit updates the state of the shared 'result' object;
            // the calls below read the outcome for this name from it.
            result.FindAndSplit(d.Name);

            var dn = new tbSplit();
            dn.DomID = d.Id;
            dn.NameShown = result.FindBestIncSeparators();
            dn.NameWords = result.SplitKeywords(dn.NameShown);
            dn.WordCount = result.BestItemCount;
            dlist.Add(dn);
        }

        try
        {
            db.BulkInsert(dlist);
            db.SaveChanges();
        }
        catch (Exception ex)
        {
            // Previously swallowed silently; report it so a lost chunk is visible in the output.
            Console.WriteLine("{0}-th chunk failed to save: {1}", (chunkNum + 1).ToString(), ex.Message);
        }

        var t1 = Math.Round((decimal)DateTime.Now.Subtract(t0).TotalSeconds).ToString();
        Console.WriteLine("{0}-th chunk is processed ({1})", (chunkNum + 1).ToString(), t1);
    }

    return true;
}
/// <summary>
/// Downloads one result page, reads the rows of its <c>table[@class='base1']</c> element and
/// stores each row through <c>WriteToDB</c> until a row with a rank below
/// <paramref name="minPR"/> is met.
/// </summary>
/// <param name="db">Context that receives the new rows; SaveChanges is called once per page.</param>
/// <param name="urlStr">Format string; the row offset is substituted as argument 0.</param>
/// <param name="n">1-based page number; the offset is (n - 1) * 25.</param>
/// <param name="minPR">Lowest rank (second column) that is still imported.</param>
/// <returns>
/// <c>true</c> when every row on the page had a rank of at least <paramref name="minPR"/>
/// (the caller uses this to request the next page); <c>false</c> when a lower rank was
/// found, the table is missing, or any exception occurred.
/// </returns>
public static bool GetPage(DomainsEntities db, string urlStr, int n, int minPR)
{
    try
    {
        var t1 = DateTime.Now;
        var test = true;
        var p = Convert.ToString((n - 1) * 25);
        var url = string.Format(urlStr, p);

        // WebClient is IDisposable; release it as soon as the page text is in memory.
        string page;
        using (var webClient = new WebClient())
        {
            page = webClient.DownloadString(url);
        }

        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(page);

        // SelectSingleNode returns null when nothing matches; report that explicitly
        // instead of failing with a NullReferenceException.
        var tableNode = doc.DocumentNode.SelectSingleNode("//table[@class='base1']");
        if (tableNode == null)
        {
            Console.Write("Table 'base1' is not found in " + url + '\n');
            return false;
        }

        // Skip the first row, keep rows with more than one cell, and take the trimmed
        // text of each cell's first child (an empty cell yields an empty string).
        List<List<string>> table = tableNode
            .Descendants("tr")
            .Skip(1)
            .Where(tr => tr.Elements("td").Count() > 1)
            .Select(tr => tr.Elements("td")
                .Select(td => td.FirstChild == null ? string.Empty : td.FirstChild.InnerText.Trim())
                .ToList())
            .ToList();

        // NOTE(review): a page with zero rows returns true, so the caller will ask for
        // the next page - confirm the source never serves empty pages indefinitely.
        foreach (var item in table)
        {
            if (int.Parse(item[1]) < minPR)
            {
                // Stop at the first row below the threshold.
                test = false;
                break;
            }

            // Rows shorter than seven cells throw here and end the page via the catch below.
            WriteToDB(db, item[0].ToLower(), item[1], item[2], item[4], item[6]);
        }

        db.SaveChanges();

        var tm = Math.Floor(DateTime.Now.Subtract(t1).TotalMilliseconds).ToString();
        Console.WriteLine("{0} is processed in {1} ms", url, tm);
        return test;
    }
    catch (Exception ex)
    {
        // Best effort: any download, parse or save failure ends this source.
        Console.Write(ex.Message + '\n');
        return false;
    }
}
/// <summary>
/// Adds a tbGooglePR row for <paramref name="dom"/> to the context unless the database
/// already holds a row with that domain. The row is only added, not saved; the caller
/// is responsible for SaveChanges.
/// </summary>
/// <param name="db">Context the new row is added to.</param>
/// <param name="dom">Domain name used as the lookup key.</param>
/// <param name="prStr">Rank as text; stored in GooglePR when it parses as an integer.</param>
/// <param name="blStr">Back-link count as text; stored when it parses to a non-zero integer.</param>
/// <param name="yStr">Year as text; when it parses, Year is set and Archive becomes true.</param>
/// <param name="dmoz">The literal "Yes" sets Dmoz to true; any other value leaves it unset.</param>
private static void WriteToDB(DomainsEntities db, string dom, string prStr, string blStr, string yStr, string dmoz)
{
    // NOTE(review): this check queries the database, so rows added to the context but
    // not yet saved are presumably not seen - confirm duplicates within one page cannot occur.
    if (db.tbGooglePRs.Any(i => i.Domain == dom))
    {
        return;
    }

    try
    {
        var tb = new tbGooglePR();
        tb.Domain = dom;

        int temp;
        if (int.TryParse(prStr, out temp))
        {
            tb.GooglePR = temp;
        }

        if (int.TryParse(yStr, out temp))
        {
            tb.Year = temp;
            tb.Archive = true;
        }

        if (int.TryParse(blStr, out temp) && temp != 0)
        {
            tb.BackLinks = temp;
        }

        if (dmoz == "Yes")
        {
            tb.Dmoz = true;
        }

        db.tbGooglePRs.Add(tb);
    }
    catch (Exception ex)
    {
        // Still best effort (the row is skipped), but no longer silent.
        Console.WriteLine("Failed to add {0}: {1}", dom, ex.Message);
    }
}
/// <summary>
/// Empties [dbo].[tbGooglePR] and then, for every URL template read from the
/// <c>gprSource</c> file, requests pages 1, 2, 3, ... through <c>GooglePR.GetPage</c>
/// until it returns false. A random pause follows every page request.
/// </summary>
/// <remarks>
/// Reads two app settings: "AverageDelay" (seconds, upper bound of the random pause)
/// and "MinPR" (passed to GetPage as the minimum rank).
/// </remarks>
private static void ImportGPR()
{
    var t1 = DateTime.Now;
    var delay = 1000 * int.Parse(ConfigurationManager.AppSettings["AverageDelay"]);
    var p = int.Parse(ConfigurationManager.AppSettings["MinPR"]);

    // One generator for the whole run instead of a new instance per request.
    var random = new Random();
    var m = 0;

    using (var db = new DomainsEntities())
    {
        db.Database.ExecuteSqlCommand("TRUNCATE TABLE [dbo].[tbGooglePR]");

        using (var reader = new StreamReader(gprSource))
        {
            string urlStr;
            while ((urlStr = reader.ReadLine()) != null)
            {
                // Both the page counter and the continue flag restart for every source.
                // Previously the flag stayed false after the first source finished,
                // so every later source was counted but never downloaded.
                var n = 1;
                var test = true;
                while (test)
                {
                    test = GooglePR.GetPage(db, urlStr, n, p);

                    // Pause for a random time in [0, delay) milliseconds between requests.
                    Thread.Sleep(random.Next(delay));
                    n++;
                }

                m++;
            }
        }

        var cnt = db.tbGooglePRs.Count();
        var tm1 = Math.Floor(DateTime.Now.Subtract(t1).TotalSeconds).ToString();
        Console.WriteLine("Completed all in {0} sec ({1} sources, {2} records)", tm1, Convert.ToString(m), cnt.ToString());
    }
}
/// <summary>
/// Reads a comma-delimited file from the output folder, enriches every record with
/// lookup ids and derived columns, and bulk-copies the rows into tbDomainsFromSrc
/// in batches of <c>_batchSize</c> records.
/// </summary>
/// <param name="fileName">
/// File name inside <c>_outputFolder</c>; with ".csv" removed it is also stored in the
/// SrcFile column of every row.
/// </param>
/// <param name="withTruncate">
/// When true, <c>_truncateLiveTableCommandText</c> is executed before loading.
/// </param>
/// <returns>
/// Number of records read from the file, including the first record, which is skipped
/// (presumably the header row).
/// </returns>
public int LoadCsvDataIntoSqlServer(string fileName, bool withTruncate)
{
    using (var db = new DomainsEntities())
    {
        // Name -> Id lookups used to replace the textual source and status of each record.
        var srcs = db.tbSrcs.ToDictionary(g => g.Name, g => g.Id);
        var sts = db.tbStatus.ToDictionary(g => g.Name, g => g.Id);
        var createdCount = 0;
        var fn = string.Format("{0}\\{1}", _outputFolder, fileName);

        using (var textFieldParser = new TextFieldParser(fn))
        {
            textFieldParser.TextFieldType = FieldType.Delimited;
            textFieldParser.Delimiters = new[] { "," };
            textFieldParser.HasFieldsEnclosedInQuotes = true;

            var dataTable = new DataTable("tbDomainsFromSrc");

            // Columns of the staging table, in the order the row array is built below.
            dataTable.Columns.Add("Domain");
            dataTable.Columns.Add("Price");
            dataTable.Columns.Add("ExpDate");
            dataTable.Columns.Add("Source", typeof(Int32));
            dataTable.Columns.Add("IdInSource");
            dataTable.Columns.Add("Age");
            dataTable.Columns.Add("Status", typeof(Int32));
            dataTable.Columns.Add("SrcFile");
            dataTable.Columns.Add("Name");
            dataTable.Columns.Add("Tld");
            dataTable.Columns.Add("Length");

            using (var sqlConnection = new SqlConnection(_connectionString))
            {
                sqlConnection.Open();

                if (withTruncate)
                {
                    // Truncate the live table
                    using (var sqlCommand = new SqlCommand(_truncateLiveTableCommandText, sqlConnection))
                    {
                        sqlCommand.ExecuteNonQuery();
                    }
                }

                // SqlBulkCopy is IDisposable; the using block releases it on every exit path.
                using (var sqlBulkCopy = new SqlBulkCopy(sqlConnection) { DestinationTableName = "tbDomainsFromSrc" })
                {
                    // Set up the column mappings; anything omitted is skipped
                    sqlBulkCopy.ColumnMappings.Add("Domain", "Domain");
                    sqlBulkCopy.ColumnMappings.Add("Price", "Price");
                    sqlBulkCopy.ColumnMappings.Add("ExpDate", "ExpDate");
                    sqlBulkCopy.ColumnMappings.Add("Source", "Source");
                    sqlBulkCopy.ColumnMappings.Add("IdInSource", "IdInSource");
                    sqlBulkCopy.ColumnMappings.Add("Age", "Age");
                    sqlBulkCopy.ColumnMappings.Add("SrcFile", "SrcFile");
                    sqlBulkCopy.ColumnMappings.Add("Status", "Status");
                    sqlBulkCopy.ColumnMappings.Add("Name", "Name");
                    sqlBulkCopy.ColumnMappings.Add("Tld", "Tld");
                    sqlBulkCopy.ColumnMappings.Add("Length", "Length");

                    // Loop through the CSV, collect records in the DataTable and send it
                    // to the live table every _batchSize records.
                    while (!textFieldParser.EndOfData)
                    {
                        string[] temp = textFieldParser.ReadFields();

                        // The first record of the file is read but not loaded.
                        if (createdCount > 0)
                        {
                            temp[0] = temp[0].ToLower();

                            if (temp.Length > 7)
                            {
                                // for price with comma
                                temp = MergeItemsInArray(temp, 1);
                            }

                            // add src
                            temp = temp.AddItemToArray(fileName.Replace(".csv", ""));

                            // prepare date
                            temp[2] = GetDateValue(temp[2]);

                            // src & status: replace names with ids (an unknown name throws KeyNotFoundException)
                            temp[3] = Convert.ToString(srcs[temp[3]]);
                            temp[6] = Convert.ToString(sts[temp[6]]);

                            // 3 calc fields: text before the first dot, text after it, and the name length
                            var name = temp[0].Split('.')[0];
                            var tld = temp[0].Substring(temp[0].IndexOf('.') + 1);
                            temp = temp.AddItemToArray(name);
                            temp = temp.AddItemToArray(tld);
                            temp = temp.AddItemToArray(name.Length.ToString());

                            dataTable.Rows.Add(temp);
                        }

                        createdCount++;

                        // createdCount is at least 1 here, so only the batch boundary needs checking.
                        if (createdCount % _batchSize == 0)
                        {
                            InsertDataTable(sqlBulkCopy, sqlConnection, dataTable);
                        }
                    }

                    // Don't forget to send the last, partial batch
                    InsertDataTable(sqlBulkCopy, sqlConnection, dataTable);
                }

                // The connection is closed by its using block.
                return createdCount;
            }
        }
    }
}