Ejemplo n.º 1
0
        //GET LINKS FROM docIds
        /// <summary>
        /// Prints the link recorded in <c>Data/pindex.txt</c> (under the current directory)
        /// for every document id in <paramref name="docIds"/>, or "NOT FOUND" when no
        /// requested id has an entry.
        /// </summary>
        /// <remarks>
        /// A usable line has at least two space-separated tokens: the first is the id
        /// followed by exactly one trailing character, the second is the link. Lines
        /// that do not fit that shape are skipped. Links are printed in file order and
        /// lower-cased, because the whole line is lower-cased before it is split.
        /// </remarks>
        /// <param name="docIds">Document ids to resolve; order and duplicates do not matter.</param>
        private static void __FindLinks(SimpleTextCrawler.Structures.LinkedList <Int32> docIds)
        {
            String path = Path.Combine(System.IO.Directory.GetCurrentDirectory(), "Data", "pindex.txt");

            Console.WriteLine(path);

            // A set lookup replaces the former "sort the ids and walk one cursor" merge:
            // that cursor stopped advancing as soon as one id was missing from the file
            // (or repeated in docIds), so every larger id was silently dropped.
            HashSet <Int32> wanted = new HashSet <Int32>((IEnumerable <Int32>)docIds);
            List <String>   result = new List <String>();

            foreach (String line in File.ReadAllLines(path, Encoding.UTF8))
            {
                if (wanted.Count == 0)
                {
                    break; // every requested id has been resolved
                }

                String[] words = line.ToLower().Split(new Char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (words.Length < 2)
                {
                    continue; // blank line, or an id without a link
                }

                // The id token carries one trailing character that is not part of the number.
                Int32 id;
                if (!Int32.TryParse(words[0].Substring(0, words[0].Length - 1), out id))
                {
                    continue;
                }

                // Remove() is true only the first time a wanted id is seen, so a repeated
                // line for the same id contributes a single link.
                if (wanted.Remove(id))
                {
                    result.Add(words[1]);
                }
            }

            if (result.Count == 0)
            {
                Console.WriteLine("NOT FOUND");
                return;
            }

            Console.WriteLine("Links:\n");
            foreach (String link in result)
            {
                Console.WriteLine(link);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Prints the links for <paramref name="docIds"/> in the order the ids are given,
        /// reading them from <c>Data/pindex.txt</c> under the current directory, or prints
        /// "NOT FOUND" when none can be resolved.
        /// </summary>
        /// <remarks>
        /// Document <c>id</c> is looked up on line <c>id</c> of the file (1-based). The link
        /// is accepted only when the id written on that line (first token minus its single
        /// trailing character) equals the requested id. Links are printed lower-cased,
        /// because the whole line is lower-cased before it is split.
        /// </remarks>
        /// <param name="docIds">Document ids to resolve; read with 1-based positions.</param>
        private static void __FindLinksV(SimpleTextCrawler.Structures.LinkedList <Int32> docIds)
        {
            String        path   = Path.Combine(System.IO.Directory.GetCurrentDirectory(), "Data", "pindex.txt");
            List <String> result = new List <String>();

            String[] data = File.ReadAllLines(path, Encoding.UTF8);

            // docIds is read at positions 1..Count, exactly as the original loop did.
            for (Int32 k = 1; k <= docIds.Count; k++)
            {
                Int32 id = docIds[k];

                // Skip ids that have no line in the file. Id 101 was special-cased by the
                // original code (where it caused an endless loop because the position was
                // never advanced); it is still skipped, but the loop now moves on.
                if (id < 1 || id > data.Length || id == 101)
                {
                    continue;
                }

                String[] words = data[id - 1].ToLower().Split(new Char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (words.Length < 2)
                {
                    continue; // blank line, or an id without a link
                }

                Int32 lineId;
                if (Int32.TryParse(words[0].Substring(0, words[0].Length - 1), out lineId) && lineId == id)
                {
                    result.Add(words[1]);
                }
            }

            if (result.Count == 0)
            {
                Console.WriteLine("NOT FOUND");
                return;
            }

            Console.WriteLine("Links:\n");
            foreach (String link in result)
            {
                Console.WriteLine(link);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Console entry point. Loads the index through <c>IndexBuilder.ReadData</c>, derives
        /// an IDF value and tf-idf weights for every index entry, then answers free-text
        /// queries with a vector search until standard input ends.
        /// </summary>
        /// <remarks>
        /// For each query the five best-scoring documents are printed as "id : score" and
        /// their links are printed through <c>__FindLinksV</c>. Steps 1, 2 and 4 of the
        /// pipeline are disabled and kept below as comments.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Score slots per query: document ids 1..100 are used, slot 0 stays empty.
            const Int32 DocSlots = 101;

            //Console.WriteLine("Input link: ");
            //String s = Console.ReadLine();
            //Console.WriteLine("Input Directory to save: ");
            //String d = Console.ReadLine();

            String startupPath = System.IO.Directory.GetCurrentDirectory();

            //startupPath = Path.Combine(startupPath, d);

            dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
            //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas

            /* STEP 1. PARSE HTML
             * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
             *
             * parser.SetSettings(new NekdoSettings(1,100));
             * parser.OnNewData += NewData;
             * parser.OnComplete += Complete;
             * dir = new DirectoryInfo(startupPath);
             * try{
             *  dir.Create();
             * }
             * catch(IOException){
             *  Console.WriteLine("This directory has already exist. Continue work with this directory");
             * }
             * parser.Start();
             * while(parser.IsActive()){//awaiting parser...
             *
             * }
             *
             * CreateIndexF(parser.GetUrls());
             */

            //STEP 2 STEMMING

            /*
             * TrainDataParser TDP = new TrainDataParser();
             *
             * Lemmatization(TDP);
             *
             * Console.WriteLine("");
             */

            //STEP 3 CREATING INDEX.
            String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");

            Console.WriteLine("===STEP 3 ===");

            IndexBuilder builder = new IndexBuilder();

            Console.WriteLine("Source: {0} ", builder.Source);
            Console.WriteLine("Dest: {0}", indexFileP);

            LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX

            // Needed by the vector search (STEP 5): fill IDF and tf-idf weights per entry.
            foreach (KeyValuePair <String, IndexEntry> p in indexer)
            {
                // IDF is taken as 100 / (number of ids listed for the entry), rounded to
                // 5 decimals; the logarithmic variant is kept for reference.
                Double idf = Math.Round(100.0 / p.Value.Ids.Count, 5);
                p.Value.IDF = idf;//Math.Log(100.0/p.Value.Ids.Count, 10.0);

                foreach (Double prob in p.Value.Probs)
                {
                    p.Value.Weights.Add(prob * idf); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
                }
                //String data = p.Key +" : "+ p.Value;
                //__CreateIFile(indexFileP, data);//read Data from indexer to file.
            }

            Console.WriteLine("Done.");

            // stem and bp are only referenced by the disabled boolean search (STEP 4);
            // they are still constructed here exactly as before.
            IStemmer         stem   = new RussianStemmer();                                    //STEMMER
            BoolSyntaxParser bp     = new BoolSyntaxParser();                                  //PARSER OF BOOLEAN EXPRESSIONS
            ILemmatizer      lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.

            //STEP 4. BOOLEAN SEARCH BY(indexer)

            /*
             * while(true){
             *  Console.WriteLine("Input search str...");
             *  String ui = Console.ReadLine();
             *
             *  String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
             *  LinkedStack<String> ui_w =  bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
             *
             *  String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
             *  foreach(String it2 in ui_wa){
             *      Console.WriteLine(it2);
             *  }
             *  SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
             *  __FindLinks(idsOf);
             *
             * }*/

            //STEP 5 Vector SEARCH BY(indexer).

            ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.

            Console.WriteLine("VECTOR SEARCH...\n");
            while (true)
            {
                PQ.Clear();
                Console.WriteLine("Input search str...");
                String ui = Console.ReadLine();
                if (ui == null)
                {
                    // ReadLine returns null once standard input is exhausted (EOF or a
                    // closed/redirected stream). Previously this crashed on ui.ToLower().
                    break;
                }

                Double[] score = new Double[DocSlots];
                //Double[] lengths = new Double[101];//ST_C
                Double[] lengths = builder.GetLens();//ST_UC
                Double   q_w     = 0.0;
                String[] u       = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (String t in u)
                {
                    IndexEntry te;
                    if (indexer.TryGetValue(lemmer.Lemmatize(t), out te))
                    {
                        q_w += te.IDF * te.IDF;

                        // Weights is read from position 1 while walking Ids in step, as the
                        // original did — presumably the project's list is 1-based; confirm.
                        Int32 i = 1;
                        foreach (Int32 id in te.Ids)
                        {
                            score[id] += te.Weights[i];
                            //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                            i++;
                        }
                    }
                }
                q_w = Math.Sqrt(q_w);
                if (q_w == 0.0)
                {
                    // No query term was found in the index.
                    Console.WriteLine("NOT FOUND");
                }
                else
                {
                    for (Int32 k = 1; k < DocSlots; k++)
                    {
                        // lengths is 0-based: the value for document k sits at k - 1.
                        if (lengths[k - 1] == 0) //ST_C
                        {
                            continue;            //ST_C
                        }
                        //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                        //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                        score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                        if (score[k] == 0.0)
                        {
                            continue;
                        }

                        // The heap hands out the smallest key first, so storing 1/score
                        // yields documents from highest to lowest score.
                        PQ.Add(new HeapEntry()
                        {
                            Relevance = 1d / score[k], Id = k
                        });                                                      //ASC ORDER
                    }
                    SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
                    Int32 KM = 5; // report at most this many documents per query
                    while (!PQ.IsEmpty() && KM > 0)
                    {
                        HeapEntry et = PQ.DeleteMin();
                        Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                        docIds.Add(et.Id);
                        KM--;
                    }
                    Console.WriteLine("");
                    __FindLinksV(docIds);
                }
            }
        }
Ejemplo n.º 4
0
        //COMPUTE BY VECTORS AND OPERATIONS IDS OF DOCUMENTS.
        /// <summary>
        /// Evaluates a boolean query whose tokens arrive in postfix order (operands first,
        /// operator after them) and returns the ids of the documents that satisfy it.
        /// </summary>
        /// <remarks>
        /// Every operand is lemmatized and turned into a 101-slot Boolean vector by
        /// <c>__GetBVector</c>; slots 1..100 are the ones read and written here. A unary
        /// operator negates the vector on top of the stack in place. A binary operator
        /// pops two vectors and pushes their combination: "or" gives a logical OR, any
        /// other binary operator gives a logical AND. When an operator lacks operands, or
        /// nothing is left on the stack at the end, "Error in Expression" is printed and an
        /// empty list is returned.
        /// </remarks>
        /// <param name="lemmer">Lemmatizer applied to each operand token.</param>
        /// <param name="indexer">Index handed to <c>__GetBVector</c> for each operand.</param>
        /// <param name="expr">Query tokens in postfix order.</param>
        /// <returns>Ids (1..100) whose slot is true in the vector on top of the stack.</returns>
        private static SimpleTextCrawler.Structures.LinkedList <Int32> __GetIds(ILemmatizer lemmer, LinkedDictionary <String, IndexEntry> indexer, String[] expr)
        {
            SimpleTextCrawler.Structures.LinkedList <Int32> matches = new SimpleTextCrawler.Structures.LinkedList <Int32>();
            LinkedStack <Boolean[]> operands = new LinkedStack <Boolean[]>();

            for (Int32 pos = 0; pos < expr.Length; pos++)
            {
                String token = expr[pos];

                if (__isUnOperator(token))
                {
                    if (operands.IsEmpty())
                    {
                        Console.WriteLine("Error in Expression");
                        return(matches);
                    }

                    // Negate the top vector in place, then put it back.
                    Boolean[] flipped = operands.Top();
                    operands.Pop();
                    for (Int32 doc = 1; doc < 101; doc++)
                    {
                        flipped[doc] = !flipped[doc];
                    }
                    operands.Push(flipped);
                    continue;
                }

                if (!__isOperator(token))
                {
                    // Plain term: push its document vector.
                    operands.Push(__GetBVector(indexer, lemmer.Lemmatize(token)));
                    continue;
                }

                // Binary operator: needs two vectors on the stack.
                if (operands.Count < 2)
                {
                    Console.WriteLine("Error in Expression");
                    return(matches);
                }

                Boolean[] first = operands.Top();
                operands.Pop();
                Boolean[] second = operands.Top();
                operands.Pop();

                Boolean[] combined = new Boolean[101];
                Boolean   isOr     = token == "or"; // every other binary operator acts as AND
                for (Int32 doc = 1; doc < 101; doc++)
                {
                    combined[doc] = isOr ? (first[doc] || second[doc])
                                         : (first[doc] && second[doc]);
                }
                operands.Push(combined);
            }

            if (operands.IsEmpty())
            {
                Console.WriteLine("Error in Expression");
                return(matches);
            }

            Boolean[] verdict = operands.Top();
            operands.Pop();
            for (Int32 doc = 1; doc < 101; doc++)
            {
                if (verdict[doc])
                {
                    matches.Add(doc);
                }
            }
            return(matches);
        }