Beispiel #1
0
        /// <summary>
        /// Console entry point. Loads the index through <c>IndexBuilder.ReadData</c>, assigns a
        /// tf-idf weight to every posting, and then answers free-text queries with a vector
        /// search, printing the best-scoring document ids for each query. The query loop ends
        /// when standard input has no more lines.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Number of documents the scoring below is sized for. The same value drives the IDF
            // numerator, the score array and the ranking loop.
            // NOTE(review): presumably the corpus size (ids 1..100) - confirm against IndexBuilder.
            const Int32 docCount = 100;

            // Maximum number of hits printed per query.
            const Int32 topK = 5;

            //Console.WriteLine("Input link: ");
            //String s = Console.ReadLine();
            //Console.WriteLine("Input Directory to save: ");
            //String d = Console.ReadLine();

            String startupPath = System.IO.Directory.GetCurrentDirectory();

            //startupPath = Path.Combine(startupPath, d);


            dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
            //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas

            /* STEP 1. PARSE HTML
             * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
             *
             * parser.SetSettings(new NekdoSettings(1,100));
             * parser.OnNewData += NewData;
             * parser.OnComplete += Complete;
             * dir = new DirectoryInfo(startupPath);
             * try{
             *  dir.Create();
             * }
             * catch(IOException){
             *  Console.WriteLine("This directory has already exist. Continue work with this directory");
             * }
             * parser.Start();
             * while(parser.IsActive()){//awaiting parser...
             *
             * }
             *
             *
             *
             * CreateIndexF(parser.GetUrls());
             */


            //STEP 2 STEMMING

            /*
             * TrainDataParser TDP = new TrainDataParser();
             *
             *
             * Lemmatization(TDP);
             *
             * Console.WriteLine("");
             */
            //STEP 3 CREATING INDEX.
            String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");

            Console.WriteLine("===STEP 3 ===");

            IndexBuilder builder = new IndexBuilder();

            Console.WriteLine("Source: {0} ", builder.Source);
            Console.WriteLine("Dest: {0}", indexFileP);


            LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX


            // UNCOMMENT FOR VECTOR RETRIEVAL (STEP 5)

            // Derive the per-term IDF and the per-posting tf-idf weights used by the vector search.
            foreach (KeyValuePair <String, IndexEntry> p in indexer)
            {
                // IDF is the plain ratio docCount / (number of ids for the term), rounded to
                // 5 decimals; the logarithmic variant is kept in the trailing comment below.
                Double I = Math.Round((Double)docCount / p.Value.Ids.Count, 5);
                p.Value.IDF = I;//Math.Log(100.0/p.Value.Ids.Count, 10.0);

                foreach (Double prob in p.Value.Probs)
                {
                    p.Value.Weights.Add(prob * I); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
                }
                //String data = p.Key +" : "+ p.Value;
                //__CreateIFile(indexFileP, data);//read Data from indexer to file.
            }

            Console.WriteLine("Done.");



            // NOTE(review): stem and bp are only referenced by the disabled STEP 4 code below;
            // they are still constructed here because their constructors are not visible.
            IStemmer         stem   = new RussianStemmer();                                    //STEMMER
            BoolSyntaxParser bp     = new BoolSyntaxParser();                                  //PARSER OF BOOLEAN EXPRESSIONS
            ILemmatizer      lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.


            //STEP 4. BOOLEAN SEARCH BY(indexer)

            /*
             * while(true){
             *  Console.WriteLine("Input search str...");
             *  String ui = Console.ReadLine();
             *
             *  String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
             *  LinkedStack<String> ui_w =  bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
             *
             *  String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
             *  foreach(String it2 in ui_wa){
             *      Console.WriteLine(it2);
             *  }
             *  SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
             *  __FindLinks(idsOf);
             *
             * }*/


            //STEP 5 Vector SEARCH BY(indexer).

            ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.

            // Characters that separate query terms; the set is the same for every query.
            Char[] separators = new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' };

            Console.WriteLine("VECTOR SEARCH...\n");
            while (true)
            {
                PQ.Clear();
                Console.WriteLine("Input search str...");
                String ui = Console.ReadLine();
                if (ui == null)
                {
                    // Console.ReadLine returns null once input is exhausted (EOF / redirected
                    // input); stop instead of failing with a NullReferenceException below.
                    break;
                }

                // score[id] accumulates the tf-idf weights of the query terms for document id;
                // slot 0 is never ranked because the ranking loop starts at 1.
                Double[] score = new Double[docCount + 1];
                //Double[] lengths = new Double[101];//ST_C
                Double[] lengths = builder.GetLens();//ST_UC
                Double   q_w     = 0.0;
                String[] u       = ui.ToLower().Replace('ё', 'е').Split(separators, StringSplitOptions.RemoveEmptyEntries);
                foreach (String t in u)
                {
                    IndexEntry te;
                    if (indexer.TryGetValue(lemmer.Lemmatize(t), out te))
                    {
                        q_w += te.IDF * te.IDF;
                        // NOTE(review): Weights is read starting at index 1 in step with Ids -
                        // presumably the project's list type is 1-based; confirm.
                        Int32 i = 1;
                        foreach (Int32 id in te.Ids)
                        {
                            score[id] += te.Weights[i];
                            //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                            i++;
                        }
                    }
                }
                // Query norm: square root of the summed squared IDFs of the matched terms.
                q_w = Math.Sqrt(q_w);
                if (q_w == 0.0)
                {
                    // No query term was found in the index.
                    Console.WriteLine("NOT FOUND");
                }
                else
                {
                    for (Int32 k = 1; k <= docCount; k++)
                    {
                        // lengths is read at k - 1 while score is read at k.
                        // NOTE(review): presumably GetLens() is 0-based by document - confirm.
                        if (lengths[k - 1] == 0) //ST_C
                        {
                            continue;            //ST_C
                        }
                        //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                        //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                        score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                        if (score[k] == 0.0)
                        {
                            continue;
                        }
                        // The reciprocal is stored so that DeleteMin returns the highest score first.
                        PQ.Add(new HeapEntry()
                        {
                            Relevance = 1d / score[k], Id = k
                        });                                                      //ASC ORDER
                    }
                    SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
                    Int32 remaining = topK;
                    while (!PQ.IsEmpty() && remaining > 0)
                    {
                        HeapEntry et = PQ.DeleteMin();
                        // Invert the stored reciprocal again to print the actual score.
                        Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                        docIds.Add(et.Id);
                        remaining--;
                    }
                    Console.WriteLine("");
                    __FindLinksV(docIds);
                }
            }
        }