예제 #1
0
        /// <summary>
        /// Loads the term dictionary from a fixed posting folder into <c>d</c>, while
        /// <c>loadDocs</c> runs for the same folder on a background thread.
        /// </summary>
        /// <remarks>
        /// The dictionary file is read as pairs of lines. The first line of a pair is a
        /// comma-separated header: the term, two integers, a field whose first character is
        /// passed to the <c>TermInfo</c> constructor, and an integer handed to <c>setPointer</c>.
        /// The second line is a comma-separated list whose items are appended to
        /// <c>nextString</c>. A missing second line at the very end of the file is treated as
        /// an empty line instead of failing with an out-of-range index.
        /// </remarks>
        private void loadDictionaryButton_Click()
        {
            string dictPath = @"C:\Users\ירדן\Desktop\לימודים\שנה ד\סמס1\אחזור מידע\data\posting";
            bool   stemm    = false;

            d    = new Dictionary <string, TermInfo>();
            docs = new Dictionary <string, DocumentInfo>();

            string p = dictPath + (stemm ? @"\DictionaryStemm.txt" : @"\Dictionary.txt");

            if (!File.Exists(p))
            {
                System.Windows.Forms.MessageBox.Show("Dictionary not found !");
                return;
            }

            Thread tDocs = new Thread(() => loadDocs(dictPath, stemm));
            tDocs.Start();
            try
            {
                string[] tempDic = File.ReadAllLines(p);
                for (int i = 0; i < tempDic.Length; i += 2)
                {
                    string[] t    = tempDic[i].Split(',');
                    string   term = t[0];
                    TermInfo ti   = new TermInfo(t[0], int.Parse(t[1]), int.Parse(t[2]), t[3][0]);
                    ti.setPointer(int.Parse(t[4]));

                    // The last header may have no companion line (odd line count);
                    // handle it exactly like an empty companion line.
                    string nextLine = (i + 1 < tempDic.Length) ? tempDic[i + 1] : "";
                    foreach (string next in nextLine.Split(','))
                    {
                        ti.nextString.Add(next);
                    }
                    d[term] = ti;
                }
            }
            finally
            {
                // Always wait for the background load, even when parsing failed, so the
                // thread is not left running after this handler has exited.
                tDocs.Join();
            }

            System.Windows.Forms.MessageBox.Show("load data succeeded");
        }
예제 #2
0
        /// <summary>
        /// Writes one mini-posting file for a batch of terms and folds the batch statistics
        /// into the accumulated dictionary <c>fullD</c>.
        /// </summary>
        /// <remarks>
        /// The file is created at <c>path\idx.txt</c>. For each term, in sorted key order, the
        /// term is written on one line and the result of <c>getStringOfDocs()</c> on the next.
        /// Afterwards <paramref name="terms"/> is cleared and <c>idx</c> is incremented.
        /// </remarks>
        /// <param name="terms">Batch dictionary keyed by term; emptied by this method.</param>
        public void buildInvertedIndex(Dictionary <string, TermInfo> terms)
        {
            List <string> orderedKeys = terms.Keys.ToList();
            orderedKeys.Sort();

            string postingFile = path + @"\" + idx + ".txt";

            using (FileStream stream = File.Create(postingFile))
            using (StreamWriter writer = new StreamWriter(stream))
            {
                foreach (string key in orderedKeys)
                {
                    TermInfo batchEntry = terms[key];

                    // Posting file: term line followed by its documents line.
                    string postingLine = batchEntry.getStringOfDocs();
                    writer.WriteLine(key);
                    writer.WriteLine(postingLine);

                    // Dictionary: create the entry on first sight, otherwise accumulate.
                    int docCount    = batchEntry.locations.Count;
                    int occurrences = batchEntry.corpusF;
                    if (!fullD.ContainsKey(key))
                    {
                        TermInfo created = new TermInfo(key, docCount, occurrences, batchEntry.type);
                        created.nextStringFull = batchEntry.nextStringFull;
                        fullD[key]             = created;
                    }
                    else
                    {
                        TermInfo known = fullD[key];
                        known.docF    += docCount;
                        known.corpusF += occurrences;
                        known.addNextString(batchEntry.nextStringFull);
                    }
                }
                terms.Clear();
            }

            idx++;
        }
예제 #3
0
        /// <summary>
        /// Reads up to <paramref name="amount"/> files starting at the current file index,
        /// splits each file into documents, parses every document, and builds a mini
        /// dictionary for the batch.
        /// </summary>
        /// <remarks>
        /// Each parsed document is appended to <c>docsInDataBase</c> and its term count is
        /// added to <c>countDocL</c>. The file index advances by one per file read. The batch
        /// stops early when the file list is exhausted instead of indexing past its end.
        /// </remarks>
        /// <param name="amount">Maximum number of files to consume in this batch.</param>
        /// <returns>
        /// The terms found in the batch keyed by term, or <c>null</c> when the file list is
        /// empty. The dictionary is empty when no file was left to read.
        /// </returns>
        public Dictionary <string, TermInfo> readBatch(int amount)
        {
            if (files.Length <= 0)
            {
                return(null);
            }

            Dictionary <string, TermInfo> termsInFiles = new Dictionary <string, TermInfo>();

            // Bound by the remaining files as well: the last batch may be shorter than amount.
            for (int i = 0; i < amount && idxFile < files.Length; i++)
            {
                string        text = System.IO.File.ReadAllText(files[idxFile]);
                List <string> docs = getDocs(text);
                foreach (string doc in docs)
                {
                    parser = new Parse(stopWords, stemm);
                    Dictionary <string, int> termsInCurrDoc = parser.parseDoc(doc); // term and its tf in the doc
                    DocumentInfo             d = parser.getDoc();                   // doc details
                    d.numTermsInDoc = parser.countTerms;

                    // add terms from the doc to the batch dictionary
                    foreach (KeyValuePair <string, int> curr in termsInCurrDoc)
                    {
                        TermInfo info;
                        if (!termsInFiles.TryGetValue(curr.Key, out info))
                        {
                            info      = new TermInfo();
                            info.term = curr.Key;
                            info.type = parser.termType[curr.Key];
                            // Start both counters at zero so the shared update below
                            // leaves a new term with corpusF == tf and docF == 1.
                            info.corpusF = 0;
                            info.docF    = 0;
                            termsInFiles[curr.Key] = info;
                        }

                        info.locations[d.docID] = curr.Value;
                        info.corpusF           += curr.Value;
                        info.docF++;
                        if (parser.nextTerm.ContainsKey(curr.Key))
                        {
                            info.addNextString(parser.nextTerm[curr.Key]);
                        }
                        info.inFirstThird[d.docID] = parser.termsInFirstThirdOfText[curr.Key];
                        info.inLast10[d.docID]     = parser.termsInLast10OfText[curr.Key];
                    }

                    docsInDataBase.Add(d);
                    countDocL += d.numTermsInDoc;
                }
                idxFile++;
            }
            return(termsInFiles);
        }
예제 #4
0
        /// <summary>
        /// Loads the term dictionary from the folder entered in <c>PathPostingDictionary</c>
        /// into <c>d</c>, while <c>loadDocs</c> runs for the same folder on a background thread.
        /// </summary>
        /// <remarks>
        /// <c>DictionaryStemm.txt</c> is used when the stemming check box has content and is
        /// checked; otherwise <c>Dictionary.txt</c> is used. The file is read as pairs of lines.
        /// The first line of a pair is a comma-separated header: the term, two integers, a
        /// field whose first character is passed to the <c>TermInfo</c> constructor, and an
        /// integer handed to <c>setPointer</c>. The second line is a comma-separated list whose
        /// items are appended to <c>nextString</c>. A missing second line at the very end of
        /// the file is treated as an empty line instead of failing with an out-of-range index.
        /// </remarks>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void loadDictionaryButton_Click(object sender, RoutedEventArgs e)
        {
            string dictPath = PathPostingDictionary.Text;
            bool   stemm    = stemmingCheck.HasContent && stemmingCheck.IsChecked == true;

            d    = new Dictionary <string, TermInfo>();
            docs = new Dictionary <string, DocumentInfo>();

            string p = dictPath + (stemm ? @"\DictionaryStemm.txt" : @"\Dictionary.txt");

            if (!File.Exists(p))
            {
                System.Windows.Forms.MessageBox.Show("Dictionary not found !");
                return;
            }

            Thread tDocs = new Thread(() => loadDocs(dictPath, stemm));
            tDocs.Start();
            try
            {
                string[] tempDic = File.ReadAllLines(p);
                for (int i = 0; i < tempDic.Length; i += 2)
                {
                    string[] t    = tempDic[i].Split(',');
                    string   term = t[0];
                    TermInfo ti   = new TermInfo(t[0], int.Parse(t[1]), int.Parse(t[2]), t[3][0]);
                    ti.setPointer(int.Parse(t[4]));

                    // The last header may have no companion line (odd line count);
                    // handle it exactly like an empty companion line.
                    string nextLine = (i + 1 < tempDic.Length) ? tempDic[i + 1] : "";
                    foreach (string next in nextLine.Split(','))
                    {
                        ti.nextString.Add(next);
                    }

                    d[term] = ti;
                }
            }
            finally
            {
                // Always wait for the background load, even when parsing failed, so the
                // thread is not left running after this handler has exited.
                tDocs.Join();
            }

            System.Windows.Forms.MessageBox.Show("load data succeeded");
        }