Exemple #1
0
        /// <summary>
        /// Crawls pages starting from <paramref name="seed"/> and stores one <c>AllPage</c> row
        /// (URL plus the text returned by <c>parseText</c>) for every page that could be loaded
        /// and has a body. At most 5000 entries of the link list are visited.
        /// </summary>
        /// <param name="seed">
        /// The start URL. Declared as <see cref="object"/> but cast to <see cref="string"/>,
        /// so any other runtime type raises <see cref="InvalidCastException"/>.
        /// </param>
        public void crawler(Object seed)
        {
            const int maxVisited = 5000;

            List <string> all_links = new List <string>();

            all_links.Add((string)seed); //seed

            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                for (int i = 0; i < maxVisited && i < all_links.Count; i++)
                {
                    // Capture the URL before all_links is replaced below, so the stored
                    // URL is always the one that was actually loaded.
                    string url = all_links[i];

                    HtmlAgilityPack.HtmlDocument doc = load(url);
                    if (doc == null)
                    {
                        continue;
                    }

                    string myBody = parseText(doc);
                    if (myBody == null)   //no body
                    {
                        continue;
                    }

                    // presumably new_list merges the newly found links into the frontier — verify against new_list.
                    List <string> links = getLinks(doc, url);
                    all_links = new_list(all_links, links);

                    AllPage page = new AllPage();
                    page.linkUrl   = url;
                    page.mycontent = myBody;
                    db.AllPages.Add(page);

                    // Saved per page so that an interrupted crawl keeps what was already fetched.
                    db.SaveChanges();
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Reads every <c>kGramIndex</c> row and writes a matching <c>DistinctkGramsIndex</c> row
        /// whose <c>terms</c> value is the comma-separated term list with duplicates removed
        /// and the remaining terms sorted.
        /// </summary>
        void AllDistictItemskGram()
        {
            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                List <kGramIndex> kGrams = db.kGramIndexes.ToList();

                foreach (var kgrm in kGrams)
                {
                    // A null term list is treated as empty instead of throwing.
                    String[] words = (kgrm.terms ?? string.Empty).Split(',').Distinct().ToArray();
                    Array.Sort(words);

                    DistinctkGramsIndex k = new DistinctkGramsIndex();
                    k.k_gram = kgrm.k_gram;
                    k.terms  = string.Join(",", words);
                    db.DistinctkGramsIndexes.Add(k);
                }

                // One save for the whole batch instead of one database round trip per row.
                db.SaveChanges();
            }
        }
Exemple #3
0
        /// <summary>
        /// Reads every <c>SoundexIndex</c> row and writes a matching <c>DistinctSoundexIndex</c> row
        /// whose <c>items</c> value is the comma-separated item list with duplicates removed
        /// and the remaining items sorted.
        /// </summary>
        void DistictItemsSoundex()
        {
            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                List <SoundexIndex> sounds = db.SoundexIndexes.ToList();

                foreach (var sound in sounds)
                {
                    // A null item list is treated as empty instead of throwing.
                    String[] words = (sound.items ?? string.Empty).Split(',').Distinct().ToArray();
                    Array.Sort(words);

                    DistinctSoundexIndex s = new DistinctSoundexIndex();
                    s.soundex = sound.soundex;
                    s.items   = string.Join(",", words);
                    db.DistinctSoundexIndexes.Add(s);
                }

                // One save for the whole batch instead of one database round trip per row.
                db.SaveChanges();
            }
        }
Exemple #4
0
        /// <summary>
        /// Persists the in-memory <c>SoundexIndex</c> collection: for each entry, in ascending key
        /// order, one <c>SoundexIndex</c> row is added whose <c>items</c> column holds the entry's
        /// values joined with commas.
        /// </summary>
        public void fillSoundexIndex()
        {
            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                // Enumerate the sorted sequence directly. Copying it into a Dictionary first
                // would not guarantee that the sorted order survives enumeration.
                foreach (var sound in SoundexIndex.OrderBy(entry => entry.Key))
                {
                    SoundexIndex soun = new SoundexIndex();
                    soun.soundex = sound.Key;
                    soun.items   = string.Join(",", sound.Value.ToArray());
                    db.SoundexIndexes.Add(soun);
                }

                // One save for the whole batch instead of one database round trip per row.
                db.SaveChanges();
            }
        }
Exemple #5
0
        /// <summary>
        /// Persists the in-memory <c>BgramIndex</c> collection: for each entry, in ascending key
        /// order, one <c>kGramIndex</c> row is added whose <c>terms</c> column holds the entry's
        /// values joined with commas.
        /// </summary>
        public void fillBgramIndex()
        {
            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                // Enumerate the sorted sequence directly. Copying it into a Dictionary first
                // would not guarantee that the sorted order survives enumeration.
                foreach (var gram in BgramIndex.OrderBy(entry => entry.Key))
                {
                    kGramIndex kG = new kGramIndex();
                    kG.k_gram = gram.Key;
                    kG.terms  = string.Join(",", gram.Value.ToArray());
                    db.kGramIndexes.Add(kG);
                }

                // One save for the whole batch instead of one database round trip per row.
                db.SaveChanges();
            }
        }
Exemple #6
0
        /// <summary>
        /// Sorts the in-memory <c>invertedIndex</c> list by <c>docID</c> and then <c>mToken</c>
        /// (the sorted list replaces the field) and stores one <c>InvertedIndex</c> row per entry.
        /// Entries are saved one at a time; an entry that fails is reported through
        /// <see cref="System.Diagnostics.Trace"/> and skipped, and processing continues.
        /// </summary>
        public void fillInvertedIndex()
        {
            invertedIndex = invertedIndex.OrderBy(x => x.docID).ThenBy(x => x.mToken).ToList();

            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                foreach (var inv in invertedIndex)
                {
                    InvertedIndex inver = null;
                    bool          added = false;

                    try
                    {
                        string combindedpos = string.Join(",", inv.docPositions.ToArray());
                        inver           = new InvertedIndex();
                        inver.docID     = inv.docID;
                        inver.frequency = inv.Freq;
                        inver.term      = inv.mToken;
                        inver.positions = combindedpos;
                        db.InvertedIndexes.Add(inver);
                        added = true;
                        db.SaveChanges();
                    }
                    catch (Exception e)
                    {
                        // Best effort per entry, but no longer silent.
                        System.Diagnostics.Trace.TraceError("fillInvertedIndex: entry skipped: {0}", e);

                        if (added)
                        {
                            // A row whose save failed stays pending in the context and would make
                            // every later SaveChanges fail as well. Removing an entity that is
                            // still in the Added state detaches it.
                            try
                            {
                                db.InvertedIndexes.Remove(inver);
                            }
                            catch (Exception cleanupError)
                            {
                                System.Diagnostics.Trace.TraceError("fillInvertedIndex: could not discard failed entry: {0}", cleanupError);
                            }
                        }
                    }
                }
            }
        }
Exemple #7
0
        /// <summary>
        /// Runs the indexing pipeline: loads all <c>AllPages</c> rows into <c>AllDocuments</c>,
        /// calls <c>startIndexing</c>, then runs <c>fillInvertedIndex</c>, <c>fillBgramIndex</c>
        /// and <c>fillSoundexIndex</c> on separate threads. Once the k-gram and soundex threads
        /// have finished, the distinct soundex and distinct k-gram tables are built.
        /// </summary>
        public void Indexing()
        {
            // NOTE(review): assumes IRdbEntities is an Entity Framework context (IDisposable) — confirm.
            using (IRdbEntities db = new IRdbEntities())
            {
                AllDocuments = db.AllPages.ToList();

                // Kept inside the using block so the context is still alive while
                // startIndexing works on the loaded pages.
                startIndexing();
            }

            Thread invertedThread = new Thread(fillInvertedIndex);
            Thread bgramThread    = new Thread(fillBgramIndex);
            Thread soundexThread  = new Thread(fillSoundexIndex);

            // Each thread is started exactly once; a second Start() on the same
            // Thread throws ThreadStateException.
            invertedThread.Start();
            bgramThread.Start();
            soundexThread.Start();

            // The distinct tables are built from the rows these two threads write.
            bgramThread.Join();
            soundexThread.Join();

            // NOTE(review): the inverted-index thread is not joined (as before), so this method
            // can return while fillInvertedIndex is still running — confirm that is intended.
            DistictItemsSoundex();
            AllDistictItemskGram();
        }