/// <summary>
/// Parses a result page and collects what the tag, attribute and text
/// handlers extract into a fresh <see cref="SearchEngineResult"/> tagged as
/// <c>SearchEngineType.Google</c>.
/// </summary>
/// <param name="html">Markup of the page; it is converted to bytes with <paramref name="encoding"/> before parsing.</param>
/// <param name="encoding">Encoding used both to produce the byte buffer and to configure the parser.</param>
/// <returns>The result object that the handlers populated while the page was walked.</returns>
public SearchEngineResult Parse(string html, Encoding encoding)
        {
            HTMLparser parser = HtmlParserFactory.GetInstance();

            // Reset the per-parse working objects shared with the handlers.
            searchResult = new SearchEngineResult();
            searchResult.SearchEngineType = SearchEngineType.Google;
            item = new SearchEngineResult.ResultItem();

            parser.Init(encoding.GetBytes(html));
            parser.SetEncoding(encoding);

            int state = 0;
            bool encodingApplied = false;

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                HTMLchunkType kind = chunk.oType;

                if (kind == HTMLchunkType.OpenTag)
                {
                    HandleOpenTag(chunk, ref state);

                    // <meta> tags get a chance to adjust the parser encoding
                    // before the tag's attributes are inspected.
                    if (chunk.sTag == "meta")
                    {
                        HandleMetaEncoding(parser, chunk, ref encodingApplied);
                    }

                    HandleParam(chunk, ref state);
                }
                else if (kind == HTMLchunkType.CloseTag)
                {
                    HandleCloseTag(chunk, ref state);
                }
                else if (kind == HTMLchunkType.Text)
                {
                    HandleText(chunk, ref state);
                }
                // Every other chunk type is ignored.
            }

            return searchResult;
        }
Exemple #2
0
 /// <summary>
 /// Creates a <see cref="SearchEngineResult"/> of type
 /// <c>SearchEngineType.Other</c> and adds two hard-coded result items to it.
 /// </summary>
 /// <returns>The result object after both fixed items have been added.</returns>
 private static SearchEngineResult GetSearchResult()
 {
     SearchEngineResult fixture = new SearchEngineResult();
     fixture.SearchEngineType = SearchEngineType.Other;

     SearchEngineResult.ResultItem first = new SearchEngineResult.ResultItem
     {
         Title = "INFO",
         CacheUrl = "http://aaa.www.com/",
         Description = "LinTian" + " " + 1 + " Done.",
         SimilarUrl = "http://similar.www.com/",
         Url = "http://info.tsinghua.edu.cn"
     };
     fixture.Results.Add(first);

     SearchEngineResult.ResultItem second = new SearchEngineResult.ResultItem
     {
         Title = "INFO2",
         CacheUrl = "http://aaa.www.com22222/",
         Description = "LunaR" + " " + 2 + " Done2.",
         SimilarUrl = "http://similar.www.com2222/",
         Url = "http://info.tsinghua.edu.cn2"
     };
     fixture.Results.Add(second);

     return fixture;
 }
        /// <summary>
        /// Updates the parser state for a closing tag:
        /// <c>a</c> in state 6 moves to 7; <c>li</c> in state 7 moves to 4 and
        /// stores the current item when it has a non-empty Url; <c>ul</c> in
        /// state 4 moves to -1. Any other tag/state combination is ignored.
        /// </summary>
        /// <param name="oChunk">The closing-tag chunk; only its tag name is read.</param>
        /// <param name="state">Current state, updated in place.</param>
        private void HandleCloseTag(HTMLchunk oChunk, ref int state)
        {
            switch (oChunk.sTag)
            {
                case "a":
                    if (state == 6)
                    {
                        state = 7;
                    }
                    break;

                case "li":
                    if (state == 7)
                    {
                        state = 4;

                        // Only items that captured a Url are kept; the working
                        // item is replaced only after it has been stored.
                        if (!string.IsNullOrEmpty(item.Url))
                        {
                            searchResult.Results.Add(item);
                            item = new SearchEngineResult.ResultItem();
                        }
                    }
                    break;

                case "ul":
                    if (state == 4)
                    {
                        state = -1;
                    }
                    break;
            }
        }
        /// <summary>
        /// Inspects every attribute of an opening tag and advances the parser
        /// state or fills the current result item.
        /// <list type="bullet">
        /// <item><description><c>class="g"</c> in state 2: state becomes 3 and a pending item with a non-empty Url is stored.</description></item>
        /// <item><description><c>class="r"</c> in state 3: state becomes 4.</description></item>
        /// <item><description><c>class="s"</c> in state 6: state becomes 7.</description></item>
        /// <item><description><c>class="gl"</c> in state 7: state becomes 8.</description></item>
        /// <item><description><c>href</c> in state 5: value becomes the item Url.</description></item>
        /// <item><description><c>href</c> in state 9 or 11: value becomes the SimilarUrl when it contains <c>q=related</c>, otherwise the CacheUrl when it contains <c>q=cache</c>.</description></item>
        /// </list>
        /// </summary>
        /// <param name="oChunk">The opening-tag chunk whose attribute names and values are scanned.</param>
        /// <param name="state">Current state, updated in place.</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            // The previous version wrapped this logic in a switch on
            // cParamChars[i] that only had a default branch; it is dropped
            // because it selected nothing.
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string name = oChunk.sParams[i];
                string value = oChunk.sValues[i];

                if (name == "class")
                {
                    if (value == "g" && state == 2)
                    {
                        state = 3;

                        // A new result block starts: store the previous item
                        // if it captured a Url.
                        if (!string.IsNullOrEmpty(item.Url))
                        {
                            searchResult.Results.Add(item);
                            item = new SearchEngineResult.ResultItem();
                        }
                    }
                    else if (value == "r" && state == 3)
                    {
                        state = 4;
                    }
                    else if (value == "s" && state == 6)
                    {
                        state = 7;
                    }
                    else if (value == "gl" && state == 7)
                    {
                        state = 8;
                    }
                }
                else if (name == "href")
                {
                    if (state == 5)
                    {
                        item.Url = value;
                    }
                    else if ((state == 9 || state == 11) && value != null)
                    {
                        // URL tokens are matched ordinally so the result does
                        // not depend on the current culture.
                        if (value.IndexOf("q=related", System.StringComparison.Ordinal) >= 0)
                        {
                            item.SimilarUrl = value;
                        }
                        else if (value.IndexOf("q=cache", System.StringComparison.Ordinal) >= 0)
                        {
                            item.CacheUrl = value;
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Inspects every attribute of an opening tag and advances the parser
        /// state or fills the current result item.
        /// <list type="bullet">
        /// <item><description><c>class="f"</c> in state 2: state becomes 3 and a pending item with a non-empty Url is stored.</description></item>
        /// <item><description><c>href</c> in state 4: value becomes the item Url.</description></item>
        /// <item><description><c>href</c> in state 6 or 8: value becomes the CacheUrl when it contains <c>cache</c>, otherwise the SimilarUrl when it starts with <c>s?cl=2</c>.</description></item>
        /// </list>
        /// </summary>
        /// <param name="oChunk">The opening-tag chunk whose attribute names and values are scanned.</param>
        /// <param name="state">Current state, updated in place.</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            // The previous version wrapped this logic in a switch on
            // cParamChars[i] that only had a default branch; it is dropped
            // because it selected nothing.
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string name = oChunk.sParams[i];
                string value = oChunk.sValues[i];

                if (name == "class" && value == "f" && state == 2)
                {
                    state = 3;

                    // A new result block starts: store the previous item if
                    // it captured a Url.
                    if (!string.IsNullOrEmpty(item.Url))
                    {
                        searchResult.Results.Add(item);
                        item = new SearchEngineResult.ResultItem();
                    }
                }
                else if (name == "href")
                {
                    if (state == 4)
                    {
                        item.Url = value;
                    }
                    else if ((state == 6 || state == 8) && value != null)
                    {
                        // URL tokens are matched ordinally so the result does
                        // not depend on the current culture.
                        if (value.IndexOf("cache", System.StringComparison.Ordinal) >= 0)
                        {
                            item.CacheUrl = value;
                        }
                        else if (value.StartsWith("s?cl=2", System.StringComparison.Ordinal))
                        {
                            item.SimilarUrl = value;
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Manual console walkthrough of <see cref="QueryResultRecordManager"/>:
        /// publishes sample query results through a pipeline the manager is
        /// subscribed to, then prints the stored results after each query and
        /// removal step, separated by dashed lines.
        /// </summary>
        public static void Test()
        {
            const string separator = "----------------------------------------------";

            QueryResultRecordManager recordManager = new QueryResultRecordManager("data", new TimeSpan(0, 1, 0));

            Pipeline.Pipeline pipeline = new Pipeline.Pipeline();
            pipeline.QueryResultSubscriberManager.AddSubscriber(recordManager);

            // Round 1: one search-engine result plus one suggestion result.
            QueryResult published = new QueryResult(new InputQuery("Bill Gates"));
            SearchEngineResult engineResult = CreateSampleSearchEngineResult();

            SuggestionResult suggestionResult = new SuggestionResult();
            suggestionResult.SuggestionType = SuggestionType.Google;
            suggestionResult.SearchUrl = "json";
            SuggestionResult.ResultItem suggestion = new SuggestionResult.ResultItem();
            suggestion.Index = "1";
            suggestion.Number = "2";
            suggestion.Suggestion = "haha";
            suggestionResult.Results.Add(suggestion);

            published.SearchEngineResultItems.Add(engineResult);
            published.SuggestionResultItems.Add(suggestionResult);

            pipeline.OnQueryResultReady(published);

            Thread.Sleep(1000);

            WriteResultList(recordManager.GetResultList(DateTime.Today, DateTime.Today));
            Console.WriteLine(separator);

            recordManager.RemoveAllResultList();

            WriteResultList(recordManager.GetResultList(DateTime.Today, DateTime.Today));
            Console.WriteLine(separator);

            // Round 2: publish again, then exercise date-based removal.
            published = new QueryResult(new InputQuery("Bill Gates"));
            engineResult = CreateSampleSearchEngineResult();

            // NOTE(review): round 1 adds to SearchEngineResultItems while this
            // round adds to Items; kept as written. Confirm both members are
            // intended.
            published.Items.Add(engineResult);

            pipeline.OnQueryResultReady(published);

            Thread.Sleep(1000);

            recordManager.RemoveResultListFromDate(DateTime.Today);
            WriteResultList(recordManager.GetResultList(DateTime.Today - new TimeSpan(1, 0, 0, 0), DateTime.Today));
            Console.WriteLine(separator);

            recordManager.RemoveResultListFromDate(DateTime.Today + new TimeSpan(1, 0, 0, 0));
            WriteResultList(recordManager.GetResultList(DateTime.Today - new TimeSpan(1, 0, 0, 0), DateTime.Today));
            Console.WriteLine(separator);
        }

        /// <summary>
        /// Builds the sample Google search-engine result, with a single
        /// hard-coded item, that <see cref="Test"/> publishes.
        /// </summary>
        private static SearchEngineResult CreateSampleSearchEngineResult()
        {
            SearchEngineResult engineResult = new SearchEngineResult();
            engineResult.SearchEngineType = SearchEngineType.Google;
            engineResult.SearchUrl = "http://www.google.com/query.jsp";

            SearchEngineResult.ResultItem entry = new SearchEngineResult.ResultItem();
            entry.Title = "ddd";
            entry.Url = "http://www.gfw.com/";
            entry.SimilarUrl = "http://www.g.com/ddd";
            entry.CacheUrl = "http://www.g.com/cache";
            entry.Description = "Who cares?";
            engineResult.Results.Add(entry);

            return engineResult;
        }

        /// <summary>
        /// Writes each query result in <paramref name="results"/> to the console, one per line.
        /// </summary>
        private static void WriteResultList(List<QueryResult> results)
        {
            foreach (QueryResult queryResult in results)
            {
                Console.WriteLine(queryResult);
            }
        }
 /// <summary>
 /// Produces a canned <see cref="SearchEngineResult"/> of type
 /// <c>SearchEngineType.Other</c> with one item whose description embeds the
 /// query and the current value of <c>_count</c>, then increments
 /// <c>_count</c> and sleeps for <c>_waitTime</c> before returning.
 /// </summary>
 /// <param name="query">The query; it is only concatenated into the item description.</param>
 /// <returns>The result holding the single canned item.</returns>
 public IQueryResultItem Search( InputQuery query )
 {
     SearchEngineResult canned = new SearchEngineResult();
     canned.SearchEngineType = SearchEngineType.Other;

     SearchEngineResult.ResultItem entry = new SearchEngineResult.ResultItem
     {
         Title = "INFO",
         CacheUrl = "http://aaa.www.com/",
         // Reads _count before it is incremented below.
         Description = query + " " + _count + " Done.",
         SimilarUrl = "http://similar.www.com/",
         Url = "http://info.tsinghua.edu.cn"
     };
     canned.Results.Add(entry);

     _count++;
     Thread.Sleep(_waitTime);

     return canned;
 }
        /// <summary>
        /// Inspects every attribute of an opening tag and advances the parser
        /// state or fills the current result item.
        /// <list type="bullet">
        /// <item><description><c>class="f"</c> in state 2: state becomes 3 and a pending item with a non-empty Url is stored.</description></item>
        /// <item><description><c>href</c> in state 4, 7 or 10: value becomes the item Url, CacheUrl or SimilarUrl respectively.</description></item>
        /// <item><description><c>id</c> in state 6 or 9: state becomes 7 when the value starts with <c>sogou_snapshot</c>, or 10 when it starts with <c>sogou_sis</c>.</description></item>
        /// </list>
        /// </summary>
        /// <param name="oChunk">The opening-tag chunk whose attribute names and values are scanned.</param>
        /// <param name="state">Current state, updated in place.</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            // The previous version wrapped this logic in a switch on
            // cParamChars[i] that only had a default branch; it is dropped
            // because it selected nothing.
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string name = oChunk.sParams[i];
                string value = oChunk.sValues[i];

                if (name == "class" && value == "f" && state == 2)
                {
                    state = 3;

                    // A new result block starts: store the previous item if
                    // it captured a Url.
                    if (!string.IsNullOrEmpty(item.Url))
                    {
                        searchResult.Results.Add(item);
                        item = new SearchEngineResult.ResultItem();
                    }
                }
                else if (name == "href")
                {
                    if (state == 4)
                    {
                        item.Url = value;
                    }
                    else if (state == 7)
                    {
                        item.CacheUrl = value;
                    }
                    else if (state == 10)
                    {
                        item.SimilarUrl = value;
                    }
                }
                else if (name == "id" && (state == 6 || state == 9) && value != null)
                {
                    // Element ids are matched ordinally so the result does
                    // not depend on the current culture.
                    if (value.StartsWith("sogou_snapshot", System.StringComparison.Ordinal))
                    {
                        state = 7;
                    }
                    else if (value.StartsWith("sogou_sis", System.StringComparison.Ordinal))
                    {
                        state = 10;
                    }
                }
            }
        }