Esempio n. 1
0
        /// <summary>
        /// Runs the whole indexing pipeline: parses every corpus file, optionally stems the
        /// terms, writes temporary posting files in batches, merges them back and forth between
        /// two temp folders until two files remain, performs the final merge into
        /// <paramref name="finalPath"/>, fills the in-memory cache and saves the result.
        /// </summary>
        /// <param name="path">Corpus folder. <c>stop_words.txt</c> is read from this folder as well.</param>
        /// <param name="finalPath">Folder that receives the final posting files; created when missing.</param>
        /// <param name="stem">When true, every parsed term name is replaced by its stemmed form before indexing.</param>
        public void Engine(string path, string finalPath, bool stem)
        {
            Stemmer  stemmer     = new Stemmer();
            ReadFile rf          = new ReadFile(path);
            Parser   parser      = new Parser(rf.ReadStopWords(path + "\\stop_words.txt"));
            int      filesAmount = rf.FilesAmount();
            Document currentDoc  = null;
            string   tempPath1   = @"./temp Posting Files1";
            string   tempPath2   = @"./temp Posting Files2";

            Directory.CreateDirectory(tempPath1);
            Directory.CreateDirectory(tempPath2);
            Directory.CreateDirectory(finalPath);

            // Start from empty temp folders so leftovers of a previous run are not merged in.
            DeleteAllFilesUnder(tempPath1);
            DeleteAllFilesUnder(tempPath2);

            // Batch size: the number of average-sized files that make up 9% of the corpus bytes.
            // NOTE(review): both divisions throw DivideByZeroException when the corpus has no files
            // or the average file size is 0. When numFiles evaluates to 0 the in-loop flush never
            // fires and everything is written by the single flush after the loop.
            DirectoryInfo di           = new DirectoryInfo(path);
            long          size         = di.EnumerateFiles("*", SearchOption.AllDirectories).Sum(fi => fi.Length);
            long          avgFilesSize = size / filesAmount;
            long          batchBytes   = (size * 9) / 100;
            long          numFiles     = batchBytes / avgFilesSize;
            int           count        = 0;

            for (int i = 0; i < filesAmount; i++) // send every corpus file through the parser
            {
                Match matchTEXT = rf.Seperate(i); // each successful match is one document of file i
                while (matchTEXT.Success)
                {
                    Term[] terms = parser.Parse(matchTEXT.Groups[1].Value).Values.ToArray();
                    if (stem)
                    {
                        for (int j = 0; j < terms.Length; j++)
                        {
                            terms[j].SetName(stemmer.stemTerm(terms[j].GetName()));
                        }
                    }
                    currentDoc = parser.GetDoc();
                    indexer.PrepareToPosting(terms, currentDoc);

                    // Highest term frequency inside this document; stays -1 when it has no terms.
                    int max = -1;
                    for (int j = 0; j < terms.Length; j++)
                    {
                        int currentTF = terms[j].GetTF(currentDoc);
                        if (currentTF > max)
                        {
                            max = currentTF;
                        }
                    }
                    currentDoc.SetMaxTF(max);
                    currentDoc.SetLength(terms.Length);

                    string[] details = new string[4];
                    details[0] = currentDoc.GetMaxTfString();
                    details[1] = currentDoc.GetLengthString();
                    details[2] = currentDoc.GetDateString();
                    details[3] = "";
                    DocDictionary.Add(currentDoc.GetName(), details);
                    matchTEXT = matchTEXT.NextMatch();
                }
                count++;
                if (count == numFiles)
                {
                    Console.WriteLine("create posting");
                    indexer.CreateTempPostingFile(tempPath1);
                    count = 0;
                }
            }
            if (count > 0) // files processed since the last flush still have to be written out
            {
                indexer.CreateTempPostingFile(tempPath1);
            }

            // Ping-pong merge between the two temp folders until exactly two posting files are
            // left in one of them. Termination relies solely on the breaks below: the two "done"
            // states are mutually exclusive, so a guard of the form !(A) || !(B) can never be false.
            while (true)
            {
                indexer.SetPostingNumber(0);
                Merge(tempPath1, tempPath2);
                if (CountFilesUnder(tempPath1) == 0 && CountFilesUnder(tempPath2) == 2)
                {
                    FinalMergeFolder(tempPath2, finalPath, stem);
                    break;
                }
                indexer.SetPostingNumber(0);
                Merge(tempPath2, tempPath1);
                if (CountFilesUnder(tempPath1) == 2 && CountFilesUnder(tempPath2) == 0)
                {
                    FinalMergeFolder(tempPath1, finalPath, stem);
                    break;
                }
            }
            theDictionary = indexer.GetFinalDic();

            // Cache the posting data of the entries with the largest integer in details[0]
            // (presumably the term frequency - confirm against the indexer). Take() caps the
            // selection at the dictionary size, so small corpora no longer run past the end.
            const int cacheCapacity = 10000;
            List <KeyValuePair <string, string[]> > topEntries = theDictionary
                .OrderByDescending(a => Int32.Parse(a.Value[0]))
                .Take(cacheCapacity)
                .ToList();
            foreach (KeyValuePair <string, string[]> entry in topEntries)
            {
                // details[2] names the posting file, details[3] is the byte offset inside it.
                string pathToPosting = Path.Combine(finalPath, entry.Value[2]);
                using (FileStream fs = new FileStream(pathToPosting, FileMode.Open, FileAccess.Read))
                using (BinaryReader br = new BinaryReader(fs))
                {
                    br.BaseStream.Seek(Int64.Parse(entry.Value[3]), SeekOrigin.Begin);
                    cache.Add(entry.Key, ReadLine(br));
                }
            }
            Save(finalPath, stem);
        }//engine

        /// <summary>Deletes every file found under <paramref name="folder"/>, including sub-folders.</summary>
        private static void DeleteAllFilesUnder(string folder)
        {
            foreach (string file in Directory.GetFiles(folder, "*.*", SearchOption.AllDirectories))
            {
                File.Delete(file);
            }
        }

        /// <summary>Counts the files currently found under <paramref name="folder"/>, including sub-folders.</summary>
        private static int CountFilesUnder(string folder)
        {
            return Directory.GetFiles(folder, "*.*", SearchOption.AllDirectories).Length;
        }

        /// <summary>Runs the final merge on the two posting files left in <paramref name="folder"/>, then deletes them.</summary>
        private void FinalMergeFolder(string folder, string finalPath, bool stem)
        {
            string[] remaining = Directory.GetFiles(folder, "*.*", SearchOption.AllDirectories);
            indexer.FinalMerge(remaining[0], remaining[1], finalPath, stem);
            File.Delete(remaining[0]);
            File.Delete(remaining[1]);
        }
        /// <summary>
        /// Tallies the terms of one document and records a summary line for it in <c>DocDic</c>.
        /// </summary>
        /// <param name="pair">
        /// Parallel lists: <c>pair.Key</c> holds the terms and <c>pair.Value</c> one flag per term.
        /// When <c>stemB</c> is set, the terms in <c>pair.Key</c> are overwritten with their stemmed form.
        /// </param>
        /// <param name="docname">Key under which the document summary is added to <c>DocDic</c>.</param>
        /// <param name="filename">First field of the summary, followed by the highest per-term count and the number of terms.</param>
        /// <returns>
        /// For every distinct term a two-slot counter: slot 0 counts occurrences whose flag is true,
        /// slot 1 those whose flag is false.
        /// </returns>
        public static Dictionary <string, int[]> remove(KeyValuePair <List <string>, List <bool> > pair, string docname, string filename)
        {
            List <string> words = pair.Key;
            List <bool>   flags = pair.Value;
            Dictionary <string, int[]> tallies = new Dictionary <string, int[]>();

            // A stemmer is only created when stemming is switched on.
            Stemmer stemmer = null;
            if (stemB)
            {
                stemmer = new Stemmer();
            }

            int highest = 0;

            // Walk the terms from last to first.
            for (int idx = words.Count - 1; idx >= 0; idx--)
            {
                if (stemmer != null)
                {
                    words[idx] = stemmer.stemTerm(words[idx]);
                }

                int[] slots;
                if (!tallies.TryGetValue(words[idx], out slots))
                {
                    slots = new int[2];
                    tallies[words[idx]] = slots;
                }

                if (flags[idx])
                {
                    slots[0]++;
                }
                else
                {
                    slots[1]++;
                }

                // Track the largest combined count seen for any single term.
                int combined = slots[0] + slots[1];
                if (combined > highest)
                {
                    highest = combined;
                }
            }

            DocDic.Add(docname, filename + "," + highest + "," + words.Count);

            return tallies;
        }