Ejemplo n.º 1
0
 /// <summary>
 /// Crawls a single URI: issues the request, handles a redirect by re-enqueuing the
 /// target, applies the MIME-type / content-length filter, runs the content analyzer
 /// and adds the page to the Lucene index, then downloads the body from the response
 /// socket and enqueues the links found in it.
 /// </summary>
 /// <param name="uri">The URI to crawl; its <c>Depth</c> is compared against <c>WebDepth</c>.</param>
 /// <param name="request">
 /// The previous request of this worker (may be null). It is replaced by the newly
 /// created request and set to null on redirect, on filter rejection and on any error.
 /// </param>
 /// <remarks>
 /// All exceptions raised after the request is created are logged through
 /// <c>LogError</c>; the list-view row is always released through <c>EraseItem</c>.
 /// </remarks>
 private void ParseUri(MyUri uri, ref MyWebRequest request)
 {
     string str = "";
     if ((request != null) && request.response.KeepAlive)
     {
         str = str + "连接转至: " + uri.Host + "\r\n\r\n";
     }
     else
     {
         str = str + "连接: " + uri.Host + "\r\n\r\n";
     }

     // The status row is looked up by the numeric name of the current thread.
     ListViewItem item = null;
     // lock (instead of a bare Monitor.Enter/Exit pair) guarantees the monitor is
     // released even if an exception escapes the catch below (e.g. a thread abort).
     lock (this.listViewThreads)
     {
         try
         {
             item = this.listViewThreads.Items[int.Parse(Thread.CurrentThread.Name)];
             item.SubItems[1].Text = uri.Depth.ToString();
             item.ImageIndex = 1;
             item.BackColor = System.Drawing.Color.WhiteSmoke;
             item.SubItems[2].Text = "正在连接";
             item.ForeColor = System.Drawing.Color.Red;
             item.SubItems[3].Text = uri.AbsoluteUri;
             item.SubItems[4].Text = "";
             item.SubItems[5].Text = "";
         }
         catch (Exception)
         {
             // Deliberately best-effort: a failed status update must not stop the crawl.
             // 'item' may stay null here, so every later use is null-checked.
         }
     }

     try
     {
         request = MyWebRequest.Create(uri, request, this.KeepAlive);
         request.Timeout = this.RequestTimeout * 1000; // RequestTimeout is multiplied by 1000 (presumably seconds -> ms)
         MyWebResponse response = request.GetResponse();
         str = str + request.Header + response.Header;

         // Redirect: queue the final location and drop this connection.
         if (!response.ResponseUri.Equals(uri))
         {
             this.EnqueueUri(new MyUri(response.ResponseUri.AbsoluteUri), true);
             str = str + "重定向到: " + response.ResponseUri + "\r\n";
             request = null;
             return;
         }

         // MIME-type / size filter.
         if ((!this.AllMIMETypes && (response.ContentType != null)) && (this.MIMETypes.Length > 0))
         {
             string contentType = response.ContentType.ToLower();
             int semicolon = contentType.IndexOf(';');
             if (semicolon != -1)
             {
                 // Strip parameters such as "; charset=...".
                 contentType = contentType.Substring(0, semicolon);
             }

             // Always locate the type inside MIMETypes. The original only did so for
             // types without '*', and otherwise reused the ';' offset (or -1) as the
             // regex start position, which either threw or read unrelated limits.
             int mimeIndex = this.MIMETypes.IndexOf(contentType);
             if ((contentType.IndexOf('*') == -1) && (mimeIndex == -1))
             {
                 this.LogError(uri.AbsoluteUri, str + "\r\nUnlisted Content-Type (" + contentType + "), check settings.");
                 request = null;
                 return;
             }

             if (mimeIndex != -1)
             {
                 // The two numbers following the type in MIMETypes are multiplied by 1024
                 // and used as the lower and upper content-length limits.
                 Match limits = new Regex(@"\d+").Match(this.MIMETypes, mimeIndex);
                 int minBytes = int.Parse(limits.Value) * 1024;
                 int maxBytes = int.Parse(limits.NextMatch().Value) * 1024;
                 if ((minBytes < maxBytes) && ((response.ContentLength < minBytes) || (response.ContentLength > maxBytes)))
                 {
                     this.LogError(uri.AbsoluteUri, str + "\r\nContentLength limit error (" + response.ContentLength + ")");
                     request = null;
                     return;
                 }
             }
         }

         // Decide whether the downloaded body should be kept and scanned for links.
         string lowerUrl = uri.AbsoluteUri.ToLower();
         bool parseBody = true;
         foreach (string extension in new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" })
         {
             if (lowerUrl.EndsWith(extension))
             {
                 parseBody = false;
                 break;
             }
         }
         foreach (string excluded in this.ExcludeFiles)
         {
             if ((excluded.Trim().Length > 0) && lowerUrl.EndsWith(excluded))
             {
                 parseBody = false;
                 break;
             }
         }

         string strBody = uri.ToString();
         string fileExtension = uri.LocalPath.Substring(uri.LocalPath.LastIndexOf('.') + 1).ToLower();
         if (this.Compared(fileExtension) && (strBody.Substring(strBody.Length - 1, 1) != "/"))
         {
             this.LogError("丢弃--非网页文件", strBody);
             return;
         }

         // Content analysis and indexing.
         if (!string.IsNullOrEmpty(strBody) && Uri.IsWellFormedUriString(strBody, UriKind.Absolute))
         {
             string page = GetPage(strBody);
             Html html = new Html {
                 Web = page,
                 Url = strBody
             };
             CommonAnalyze analyze = new CommonAnalyze();
             analyze.LoadHtml(html);
             Net.LikeShow.ContentAnalyze.Document result = analyze.GetResult();

             // Check for empty content BEFORE touching result.Doc; the original derived
             // the fallback title first and crashed on a null or short document.
             if (string.IsNullOrEmpty(result.Doc))
             {
                 this.LogError("丢弃--空内容或非内空页", strBody);
             }
             else
             {
                 string bt = (result.Title ?? "").Replace("[(title)]", "");
                 if (bt == "")
                 {
                     // Fallback title taken from the document text. Documents longer than
                     // 20 characters keep the original Substring(20) behaviour; shorter
                     // ones use the whole text instead of throwing.
                     // NOTE(review): Substring(0, 20) may have been intended - confirm.
                     bt = (result.Doc.Length > 20) ? result.Doc.Substring(20) : result.Doc;
                 }

                 string content = result.Doc + bt;
                 string md5 = this.MD5string(result.Doc.ToString());

                 // Duplicate detection is only performed once 10 pages have been indexed.
                 bool isDuplicate = false;
                 if (this.cgcount >= 10)
                 {
                     IndexSearcher searcher = new IndexSearcher(this.path);
                     try
                     {
                         string terms = GetKeyWordsSplitBySpace(md5, new KTDictSegTokenizer());
                         Query query = new QueryParser("J_md5_bai", new KTDictSegAnalyzer(true)).Parse(terms);
                         Hits hits = searcher.Search(query);
                         // Guard against an empty result: Doc(0) throws when there are no hits,
                         // which previously caused every new page to be dropped as an error.
                         isDuplicate = (hits.Length() > 0) && (hits.Doc(0).Get("J_md5_bai") == md5);
                     }
                     finally
                     {
                         // The searcher was previously never closed.
                         searcher.Close();
                     }
                 }

                 if (isDuplicate)
                 {
                     this.LogError("排除--重复", strBody);
                 }
                 else
                 {
                     this.cgcount++;
                     this.LogUri(bt, "引索完成");
                     Lucene.Net.Documents.Document indexDoc = new Lucene.Net.Documents.Document();
                     indexDoc.Add(new Field("分类", this.page_py, Field.Store.YES, Field.Index.TOKENIZED));
                     indexDoc.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED));
                     indexDoc.Add(new Field("J_msgContent_bai", content, Field.Store.YES, Field.Index.TOKENIZED));
                     indexDoc.Add(new Field("J_SiteType_bai", result.SiteType.ToString(), Field.Store.YES, Field.Index.NO));
                     indexDoc.Add(new Field("J_URL_bai", strBody, Field.Store.YES, Field.Index.NO));
                     indexDoc.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO));
                     indexDoc.Add(new Field("J_md5_bai", md5, Field.Store.YES, Field.Index.TOKENIZED));
                     this.writer.AddDocument(indexDoc);
                 }
             }
         }

         // Download the body from the response socket.
         if (item != null)
         {
             item.SubItems[2].Text = "正在下载";
             item.ForeColor = System.Drawing.Color.Black;
         }
         // StringBuilder avoids re-copying the whole page for every received chunk.
         StringBuilder input = new StringBuilder();
         byte[] buffer = new byte[10240];
         int nNum = 0;
         int received;
         while ((received = response.socket.Receive(buffer, 0, buffer.Length, SocketFlags.None)) > 0)
         {
             nNum += received;
             if (parseBody)
             {
                 input.Append(Encoding.ASCII.GetString(buffer, 0, received));
             }
             if (item != null)
             {
                 item.SubItems[4].Text = this.Commas(nNum);
                 if (response.ContentLength > 0)
                 {
                     // Widened to long so the "* 100" cannot overflow on large downloads.
                     item.SubItems[5].Text = "%" + (100 - (((long)(response.ContentLength - nNum) * 100) / response.ContentLength)).ToString();
                 }
             }
             // On a kept-alive connection the peer does not close the socket, so stop
             // once the announced content length has been read.
             if ((response.KeepAlive && (nNum >= response.ContentLength)) && (response.ContentLength > 0))
             {
                 break;
             }
         }

         if (response.KeepAlive)
         {
             str = str + "Connection kept alive to be used in subpages.\r\n";
         }
         else
         {
             response.Close();
             str = str + "Connection closed.\r\n";
         }
         this.FileCount++;
         this.ByteCount += nNum;

         // Link extraction.
         if ((this.ThreadsRunning && parseBody) && (uri.Depth < this.WebDepth))
         {
             str = str + "\r\nParsing page ...\r\n";
             string pattern = "(href|HREF|src|SRC)[ ]*=[ ]*[\"'][^\"'#>]+[\"']";
             MatchCollection matchs = new Regex(pattern).Matches(input.ToString());
             str = str + "Found: " + matchs.Count + " ref(s)\r\n";
             this.URLCount += matchs.Count;
             foreach (Match match in matchs)
             {
                 string link = match.Value.Substring(match.Value.IndexOf('=') + 1).Trim(new char[] { '"', '\'', '#', ' ', '>' });
                 try
                 {
                     // Resolve against the current page unless the link is already a
                     // plain absolute http:// URL without ".." segments.
                     if (!(((link.IndexOf("..") == -1) && !link.StartsWith("/")) && link.StartsWith("http://")))
                     {
                         link = new Uri(uri, link).AbsoluteUri;
                     }
                     this.Normalize(ref link);
                     MyUri uri2 = new MyUri(link);
                     // Skip non-HTTP(S) schemes, hosts whose second label differs from
                     // urllhost[1] when KeepSameServer is set, and extensions rejected
                     // by Compared_jpg.
                     if ((((uri2.Scheme != Uri.UriSchemeHttp) && (uri2.Scheme != Uri.UriSchemeHttps)) || ((uri2.Host.Split(new char[] { '.' })[1] != this.urllhost[1]) && this.KeepSameServer)) || !this.Compared_jpg(uri2.LocalPath.Substring(uri2.LocalPath.LastIndexOf('.') + 1).ToLower()))
                     {
                         continue;
                     }
                     Global.URL = uri2.ToString();
                     if ((Global.BXBH != "") && (Redspider_link.bxbh() == 2))
                     {
                         continue;
                     }
                     uri2.Depth = uri.Depth + 1;
                     if (this.EnqueueUri(uri2, true))
                     {
                         str = str + uri2.AbsoluteUri + "\r\n";
                     }
                 }
                 catch (Exception)
                 {
                     // Deliberately best-effort: a malformed link is skipped so the
                     // remaining links on the page are still processed.
                 }
             }
         }
     }
     catch (Exception exception)
     {
         this.LogError(uri.AbsoluteUri, str + exception.Message);
         request = null;
     }
     finally
     {
         this.EraseItem(item);
     }
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Fetches the page at <paramref name="urll"/>, runs the content analyzer on it and,
 /// when the analyzer returns non-empty text, adds the page to the Lucene index.
 /// </summary>
 /// <param name="urll">
 /// Absolute URL of the page. Null, empty or malformed values are ignored.
 /// </param>
 public void wyfx(string urll)
 {
     if (!string.IsNullOrEmpty(urll) && Uri.IsWellFormedUriString(urll, UriKind.Absolute))
     {
         string page = GetPage(urll);
         Html html = new Html {
             Web = page,
             Url = urll
         };
         CommonAnalyze analyze = new CommonAnalyze();
         analyze.LoadHtml(html);
         Net.LikeShow.ContentAnalyze.Document result = analyze.GetResult();

         // A null title is treated as empty instead of raising NullReferenceException.
         string bt = (result.Title ?? "").Replace("[(title)]", "");
         if (string.IsNullOrEmpty(result.Doc))
         {
             this.LogUri(bt, "丢弃--空内容");
         }
         else
         {
             if (bt == "")
             {
                 // Fallback title taken from the document text. Previously this branch
                 // only assigned the title and never indexed the page. Documents longer
                 // than 20 characters keep the original Substring(20) behaviour; shorter
                 // ones use the whole text instead of throwing.
                 // NOTE(review): Substring(0, 20) may have been intended - confirm.
                 bt = (result.Doc.Length > 20) ? result.Doc.Substring(20) : result.Doc;
             }

             this.LogUri(bt, "引索完成");
             Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
             doc.Add(new Field("J_title_bai", bt, Field.Store.YES, Field.Index.TOKENIZED));
             doc.Add(new Field("J_msgContent_bai", result.Doc, Field.Store.YES, Field.Index.TOKENIZED));
             // J_URL_bai previously received the current date; store the URL here and
             // keep the date under J_addtime_bai, the field name ParseUri uses for it.
             doc.Add(new Field("J_URL_bai", urll, Field.Store.YES, Field.Index.NO));
             doc.Add(new Field("J_addtime_bai", DateTime.Now.ToShortDateString(), Field.Store.YES, Field.Index.NO));
             this.writer.AddDocument(doc);
         }
     }
     // NOTE(review): reads one character from the console before returning; this looks
     // like a debugging leftover but is kept so existing callers behave the same.
     Console.Read();
 }