/// <summary>
/// Inserts the url and title of <paramref name="oUrlElem"/> into the UrlData table,
/// unless a row with the same url is already present.
/// </summary>
/// <param name="oUrlElem">Element whose <c>szUrl</c> and <c>szTitle</c> are written.</param>
/// <remarks>
/// Both statements run on the <c>oSqlCon</c> connection and pass every value as a SQL
/// parameter, so quotes or other SQL syntax inside a URL or title cannot alter the statement.
/// The existence check and the insert are two separate commands and are not wrapped in a
/// transaction here.
/// </remarks>
public void Write2DB(UrlElem oUrlElem)
{
    // A null field is stored as an empty string, which is what the former string
    // concatenation produced; a null parameter value would otherwise be rejected.
    string url = oUrlElem.szUrl ?? string.Empty;
    string title = oUrlElem.szTitle ?? string.Empty;

    // Existence check: ExecuteScalar returns null when no row matches.
    using (SqlCommand oSqlChk = new SqlCommand("select 1 from UrlData where url = @url", oSqlCon))
    {
        oSqlChk.Parameters.AddWithValue("@url", url);
        if (oSqlChk.ExecuteScalar() != null)
        {
            return;
        }
    }

    using (SqlCommand oSqlCmd = new SqlCommand("insert UrlData(url,title,html) values(@url,@title,@html)", oSqlCon))
    {
        oSqlCmd.Parameters.AddWithValue("@url", url);
        oSqlCmd.Parameters.AddWithValue("@title", title);
        // The html column deliberately receives a single-space placeholder;
        // the page text is not persisted by this method.
        oSqlCmd.Parameters.AddWithValue("@html", " ");
        oSqlCmd.ExecuteNonQuery();
    }
}
/// <summary>
/// Parses a downloaded page: extracts its title, enqueues a <c>UrlElem</c> describing the
/// page on <c>s_oUrlElemQue</c>, then enqueues every anchor href that has not been seen
/// before on <c>s_oURLQue</c>.
/// </summary>
/// <param name="szPageUrl">Absolute URL the content was fetched from; used as the base for relative links.</param>
/// <param name="szContent">Raw HTML text of the page.</param>
/// <remarks>
/// Each enqueue waits on the matching "EnQue" semaphore first and releases the matching
/// "DeQue" semaphore afterwards (presumably a bounded producer/consumer handshake;
/// confirm against the consumers of these queues).
/// </remarks>
private void AnalyzeContent(string szPageUrl, string szContent)
{
    string title;
    string link;
    Uri oUri = new Uri(szPageUrl);

    // NOTE(review): the '|' inside the trailing character class is a literal pipe, so an
    // href value is also cut at the first '|'. Left unchanged to keep link matching stable.
    Regex oLinkReg = new Regex("<a(\\s+.+?\\s+|\\s+)href\\s*=\\s*\"?(.*?)[\"|>]", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // [\s\S] matches any character, newlines included. It replaces "(.|\s)", whose
    // overlapping alternatives backtrack exponentially when "</title>" is missing.
    Regex oTitleReg = new Regex("<title>([\\s\\S]*?)</title>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    MatchCollection oMatchCol = oLinkReg.Matches(szContent);
    Match oMatchTitle = oTitleReg.Match(szContent);
    title = oMatchTitle.Groups[1].Value;
    title = title.Trim();

    // Line breaks are flattened to spaces and single quotes replaced by double quotes
    // in the stored page text.
    UrlElem urlElem = new UrlElem(title, szPageUrl, szContent.Replace("\r\n", " ").Replace('\'', '\"'));
    s_oElemEnQueSema.WaitOne();
    s_oUrlElemQue.Enqueue(urlElem);
    s_oElemDeQueSema.Release();

    // Scheme, host and explicit port of the current page. The page's own scheme is used
    // so that links found on https pages are not rewritten to http.
    string szAuthority = oUri.Scheme + "://" + oUri.Host + ":" + oUri.Port;

    foreach (Match omatch in oMatchCol)
    {
        link = omatch.Groups[2].Value;
        link = link.Trim();
        if (link.Length < 1)
        {
            continue;
        }

        // In-page anchor.
        if (link.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        // E-mail address.
        if (link.IndexOf("mailto:", StringComparison.Ordinal) != -1)
        {
            continue;
        }

        // Script pseudo-link (to be verified). Ordinal, case-insensitive search so the
        // result does not depend on the current culture.
        if (link.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) != -1)
        {
            continue;
        }

        if (link.IndexOf("://", StringComparison.Ordinal) == -1)
        {
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                // Root-relative link: convert to an absolute URL.
                link = szAuthority + link;
            }
            else
            {
                // Page URL without query string or fragment, so a '/' inside the query
                // cannot be mistaken for a path separator.
                String file = oUri.GetLeftPart(UriPartial.Path);
                if (file.IndexOf('/') == -1)
                {
                    // No path at all: resolve against the site root.
                    link = szAuthority + "/" + link;
                }
                else
                {
                    // Document-relative link: resolve against the page's directory.
                    String path = file.Substring(0, file.LastIndexOf('/') + 1);
                    link = path + link;
                }
            }
        }

        // Drop the fragment part.
        int index;
        if ((index = link.IndexOf('#')) != -1)
        {
            link = link.Substring(0, index);
        }

        // FormatURL returns null for links that must be skipped.
        if ((link = FormatURL(link)) == null)
        {
            continue;
        }

        if (s_oURLSet.Contains(link))
        {
            continue;
        }
        s_oURLSet.Add(link);

        // Enqueue the new link.
        s_oUrlEnQueSema.WaitOne();
        s_oURLQue.Enqueue(link);
        s_oUrlDeQueSema.Release();
    }
}
/// <summary>
/// Parses a downloaded page: extracts its title, enqueues a <c>UrlElem</c> describing the
/// page on <c>s_oUrlElemQue</c>, then enqueues every anchor href that has not been seen
/// before on <c>s_oURLQue</c>.
/// </summary>
/// <param name="szPageUrl">Absolute URL the content was fetched from; used as the base for relative links.</param>
/// <param name="szContent">Raw HTML text of the page.</param>
/// <remarks>
/// Each enqueue waits on the matching "EnQue" semaphore first and releases the matching
/// "DeQue" semaphore afterwards (presumably a bounded producer/consumer handshake;
/// confirm against the consumers of these queues).
/// </remarks>
private void AnalyzeContent(string szPageUrl, string szContent)
{
    string title;
    string link;
    Uri oUri = new Uri(szPageUrl);

    // NOTE(review): the '|' inside the trailing character class is a literal pipe, so an
    // href value is also cut at the first '|'. Left unchanged to keep link matching stable.
    Regex oLinkReg = new Regex("<a(\\s+.+?\\s+|\\s+)href\\s*=\\s*\"?(.*?)[\"|>]", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // [\s\S] matches any character, newlines included. It replaces "(.|\s)", whose
    // overlapping alternatives backtrack exponentially when "</title>" is missing.
    Regex oTitleReg = new Regex("<title>([\\s\\S]*?)</title>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    MatchCollection oMatchCol = oLinkReg.Matches(szContent);
    Match oMatchTitle = oTitleReg.Match(szContent);
    title = oMatchTitle.Groups[1].Value;
    title = title.Trim();

    // Line breaks are flattened to spaces and single quotes replaced by double quotes
    // in the stored page text.
    UrlElem urlElem = new UrlElem(title, szPageUrl, szContent.Replace("\r\n", " ").Replace('\'', '\"'));
    s_oElemEnQueSema.WaitOne();
    s_oUrlElemQue.Enqueue(urlElem);
    s_oElemDeQueSema.Release();

    // Scheme, host and explicit port of the current page. The page's own scheme is used
    // so that links found on https pages are not rewritten to http.
    string szAuthority = oUri.Scheme + "://" + oUri.Host + ":" + oUri.Port;

    foreach (Match omatch in oMatchCol)
    {
        link = omatch.Groups[2].Value;
        link = link.Trim();
        if (link.Length < 1)
        {
            continue;
        }

        // In-page anchor.
        if (link.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        // E-mail address.
        if (link.IndexOf("mailto:", StringComparison.Ordinal) != -1)
        {
            continue;
        }

        // Script pseudo-link (to be verified). Ordinal, case-insensitive search so the
        // result does not depend on the current culture.
        if (link.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) != -1)
        {
            continue;
        }

        if (link.IndexOf("://", StringComparison.Ordinal) == -1)
        {
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                // Root-relative link: convert to an absolute URL.
                link = szAuthority + link;
            }
            else
            {
                // Page URL without query string or fragment, so a '/' inside the query
                // cannot be mistaken for a path separator.
                String file = oUri.GetLeftPart(UriPartial.Path);
                if (file.IndexOf('/') == -1)
                {
                    // No path at all: resolve against the site root.
                    link = szAuthority + "/" + link;
                }
                else
                {
                    // Document-relative link: resolve against the page's directory.
                    String path = file.Substring(0, file.LastIndexOf('/') + 1);
                    link = path + link;
                }
            }
        }

        // Drop the fragment part.
        int index;
        if ((index = link.IndexOf('#')) != -1)
        {
            link = link.Substring(0, index);
        }

        // FormatURL returns null for links that must be skipped.
        if ((link = FormatURL(link)) == null)
        {
            continue;
        }

        if (s_oURLSet.Contains(link))
        {
            continue;
        }
        s_oURLSet.Add(link);

        // Enqueue the new link.
        s_oUrlEnQueSema.WaitOne();
        s_oURLQue.Enqueue(link);
        s_oUrlDeQueSema.Release();
    }
}