Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="CompressedIndex"/> from <paramref name="uncompressedIndex"/> and
        /// fills each of its postings with compressed data.
        /// </summary>
        /// <param name="uncompressedIndex">The positional index to compress.</param>
        /// <returns>The populated compressed index.</returns>
        public CompressedIndex Compress(PositionalIndex uncompressedIndex)
        {
            CompressedIndex compressedIndex = CopyDataToCompressedIndex(uncompressedIndex);

            // Flatten the raw lists of every posting into one batch so the compression
            // utility is invoked a single time for the whole index.
            var mapping = new List<RawDataMapping>();

            foreach (var tokenPosting in compressedIndex.PostingsByToken)
            {
                mapping.AddRange(tokenPosting.Value.RawData.Select(list => new RawDataMapping(tokenPosting.Key, list)));
            }

            // NOTE(review): presumably populates CompressedData on each mapping in place,
            // since that member is read from the same list right after this call.
            new CompressUtility().CompressIntList(mapping);

            // Group the mappings by token once instead of rescanning the whole list
            // for every token in the loop below.
            var mappingsByToken = mapping.ToLookup(x => x.Token);

            foreach (var tokenPosting in compressedIndex.PostingsByToken)
            {
                // A lookup yields an empty sequence for a token without mappings,
                // which matches what a filter over the flat list would produce.
                var postingCompressedStrings = mappingsByToken[tokenPosting.Key]
                                               .OrderBy(x => x.Id)
                                               .Select(x => x.CompressedData)
                                               .ToList();

                tokenPosting.Value.PlaceCompressedData(postingCompressedStrings);
            }

            return compressedIndex;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Rebuilds a <see cref="PositionalIndex"/> from <paramref name="compressedIndex"/>.
        /// </summary>
        /// <param name="compressedIndex">The compressed index to expand.</param>
        /// <returns>A new positional index holding one decompressed posting per token.</returns>
        public PositionalIndex Decompress(CompressedIndex compressedIndex)
        {
            // Collect every compressed payload (the document-id list first, then each
            // occurrence entry) into one batch so the utility is invoked a single time.
            var mapping = new List<RawDataMapping>();

            foreach (var compressedPosting in compressedIndex.PostingsByToken)
            {
                mapping.Add(new RawDataMapping(compressedPosting.Key, compressedPosting.Value.CompressedListOfDocumentIds));
                mapping.AddRange(compressedPosting.Value.OccurrencesList.Select(x => new RawDataMapping(compressedPosting.Key, x)));
            }

            // NOTE(review): presumably populates RawData on each mapping in place,
            // since that member is read from the same list right after this call.
            new CompressUtility().DecompressList(mapping);

            // Group the mappings by token once instead of rescanning the whole list
            // for every token in the loop below.
            var mappingsByToken = mapping.ToLookup(x => x.Token);

            var result = new PositionalIndex();

            foreach (var tokenPosting in compressedIndex.PostingsByToken)
            {
                var postingIntList = mappingsByToken[tokenPosting.Key]
                                     .OrderBy(x => x.Id)
                                     .Select(x => x.RawData)
                                     .ToList();

                var decompressedPosting = tokenPosting.Value.Decompress(postingIntList);

                result.AddDecompressedPosting(decompressedPosting);
            }

            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates the TF-IDF vectors of this document: always the content vector, and
        /// the title vector only when a title index is supplied.
        /// </summary>
        /// <param name="TitleIndex">Index built over titles; when null, TitleVector is left untouched.</param>
        /// <param name="ContentIndex">Index built over document contents.</param>
        public void CreateVector(PositionalIndex TitleIndex, PositionalIndex ContentIndex)
        {
            if (TitleIndex != null)
            {
                var titleTokenList = GetTokens(TitleTokens);
                TitleVector = new TfIdfVector(TitleIndex, titleTokenList, Document.Id);
            }

            var contentTokenList = GetTokens(ContentTokens);
            ContentVector = new TfIdfVector(ContentIndex, contentTokenList, Document.Id);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the per-token IDF and TF tables for one document and then calls
        /// <c>SetVector</c>.
        /// </summary>
        /// <param name="index">Index queried for IDF values and for the token occurrences.</param>
        /// <param name="tokens">Tokens to include; each one becomes a dictionary key, so a repeated token makes <c>ToDictionary</c> throw.</param>
        /// <param name="docId">Id of the document whose occurrences are counted.</param>
        public TfIdfVector(PositionalIndex index, List <string> tokens, int docId)
        {
            // Inverse document frequency as reported by the index for each token.
            IdfByToken = tokens.ToDictionary(token => token, token => index.Idf(token));

            // Term frequency: number of recorded positions of the token in this document.
            TfByToken = tokens.ToDictionary(
                token => token,
                token => index.GetOccurrence(token, docId).Positions.Count);

            SetVector();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Creates a new <see cref="CompressedIndex"/> and adds every posting of
        /// <paramref name="uncompressedIndex"/> to it, keyed by the same token.
        /// </summary>
        /// <param name="uncompressedIndex">Source index whose postings are copied.</param>
        /// <returns>The newly created compressed index.</returns>
        private CompressedIndex CopyDataToCompressedIndex(PositionalIndex uncompressedIndex)
        {
            var target = new CompressedIndex();

            foreach (var entry in uncompressedIndex.PostingsByToken)
            {
                var token   = entry.Key;
                var posting = entry.Value;

                target.AddPosting(token, posting);
            }

            return target;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Stores the raw query and the index to search, preprocesses the query and
        /// records both its distinct tokens and how often each token occurs.
        /// </summary>
        /// <param name="rawQuery">Query text exactly as entered.</param>
        /// <param name="index">Index the query will be executed against.</param>
        public QueryExecuter(string rawQuery, PositionalIndex index)
        {
            RawQuery = rawQuery;
            Index    = index;

            var preprocessor = new QueryProcessor(RawQuery);

            preprocessor.Preprocess();

            Query = preprocessor.Tokens.Distinct().ToList();

            // Count occurrences in a single grouping pass instead of rescanning the
            // full token list once per distinct token.
            QueryTokenCounts = preprocessor.Tokens
                               .GroupBy(token => token)
                               .ToDictionary(occurrences => occurrences.Key, occurrences => occurrences.Count());
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Rebuilds the content index (and the title index when <c>HasTitle</c> is set),
        /// attaches the extracted tokens to every document, and finally creates the
        /// vectors of all documents against the finished indexes.
        /// </summary>
        public void Process()
        {
            if (HasTitle)
            {
                TitleIndex = new PositionalIndex();
            }

            ContentIndex = new PositionalIndex();

            var rawDocuments = Documents.Values.Select(x => x.Document).ToList();

            // Stays null when the collection has no titles.
            Dictionary <int, DocumentTokens> titleTokensByDocId = null;

            if (HasTitle)
            {
                var titleClient = new PreprocessClient(true, true);
                titleTokensByDocId = titleClient.GetTokens(rawDocuments).ToDictionary(x => x.DocumentId);
            }

            var contentClient        = new PreprocessClient(false, true);
            var contentTokensByDocId = contentClient.GetTokens(rawDocuments).ToDictionary(x => x.DocumentId);

            // Documents are indexed in ascending document-id order.
            foreach (var contentEntry in contentTokensByDocId.OrderBy(pair => pair.Key))
            {
                var docId         = contentEntry.Key;
                var contentTokens = contentEntry.Value;

                if (HasTitle)
                {
                    var titleTokens = titleTokensByDocId[docId];

                    TitleIndex.AddDocumentToIndex(titleTokens);
                    ContentIndex.AddDocumentToIndex(contentTokens);
                    Documents[docId].SetTokens(titleTokens, contentTokens);
                }
                else
                {
                    ContentIndex.AddDocumentToIndex(contentTokens);
                    Documents[docId].SetTokens(null, contentTokens);
                }
            }

            // Vectors are created only after every document has been added to the indexes.
            foreach (var processedDocument in Documents.Values)
            {
                processedDocument.CreateVector(TitleIndex, ContentIndex);
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds the main index, prints token statistics, writes the index to a file,
        /// generates the biword data and then serves console commands in a loop
        /// (search, word info, compress/decompress).
        /// Malformed input is reported on the console and the loop carries on; the
        /// method returns only when the console input stream ends.
        /// </summary>
        void RunPhase1()
        {
            MainIndex = new PositionalIndex();

            IndexEnglishFiles();
            //IndexPersianFiles();

            var tokenRepeats = MainIndex.GetAllTokensRepeats()
                               .OrderByDescending(x => x.repeatCount)
                               .ToList();

            Console.WriteLine(string.Join(",", tokenRepeats));
            Console.WriteLine("Unique Token count is: " + tokenRepeats.Count);

            WriteInFile(MainIndex, "MainIndex");
            GenerateBiword();

            while (true)
            {
                Console.WriteLine("Enter your command: [query|proximity(windowSize)|and|compress|decompress|info(word)]");
                var command = Console.ReadLine();

                // ReadLine returns null once the input stream is closed; stop serving
                // commands instead of failing with a NullReferenceException.
                if (command == null)
                {
                    return;
                }

                // Split always yields at least one element, so index 0 is safe.
                var commandWords = command.Split(' ');
                var verb         = commandWords[0];

                bool isAndSearch       = command == "and" || command == "a";
                bool isPlainSearch     = command == "q" || command == "query";
                bool isProximitySearch = verb == "p" || verb == "proximity";

                if (isAndSearch || isPlainSearch || isProximitySearch)
                {
                    int windowSize = 0;

                    // Validate the window size up front so a missing or non-numeric
                    // argument produces a message rather than an exception.
                    if (isProximitySearch && (commandWords.Length < 2 || !int.TryParse(commandWords[1], out windowSize)))
                    {
                        Console.WriteLine("Proximity search needs a numeric window size, e.g. \"proximity 5\".");
                        continue;
                    }

                    Console.WriteLine("Enter your query: ");
                    var query = Console.ReadLine();

                    if (string.IsNullOrEmpty(query))
                    {
                        Console.WriteLine("Your query is empty.");
                        continue;
                    }

                    var queryExecuter = new QueryExecuter(query, MainIndex);

                    List <SearchResult> result;
                    if (isAndSearch)
                    {
                        result = queryExecuter.ExecuteAndSearch();
                    }
                    else if (isProximitySearch)
                    {
                        result = queryExecuter.ExecuteTfIdfSearch(windowSize);
                    }
                    else
                    {
                        result = queryExecuter.ExecuteTfIdfSearch();
                    }

                    Console.WriteLine("Query after preprocess: " + JsonConvert.SerializeObject(queryExecuter.Query));
                    Console.WriteLine(string.Join(Environment.NewLine, result));
                    Console.WriteLine("---------------------------------" + Environment.NewLine);

                    Console.WriteLine("Enter classification filter: [1|2|3|4|no filter(Enter)]");
                    var classFilterLine = Console.ReadLine();

                    // A bare Enter means "no filter": the unfiltered results are already
                    // on screen, so the filtered search is simply skipped.
                    if (string.IsNullOrWhiteSpace(classFilterLine))
                    {
                        continue;
                    }

                    byte classFilter;
                    if (!byte.TryParse(classFilterLine, out classFilter))
                    {
                        Console.WriteLine("Invalid classification filter: " + classFilterLine);
                        continue;
                    }

                    queryExecuter = new QueryExecuter(query, MainIndex);
                    result        = queryExecuter.ExecuteTfIdfSearch(null, classFilter);
                    Console.WriteLine(string.Join(Environment.NewLine, result));
                    Console.WriteLine("---------------------------------" + Environment.NewLine);
                }
                else if (command.StartsWith("info", StringComparison.Ordinal))
                {
                    if (commandWords.Length < 2)
                    {
                        Console.WriteLine("Usage: info <word>");
                        continue;
                    }

                    Console.WriteLine(MainIndex.GetWordInfo(commandWords[1]));
                }
                else if (command.Contains("compress"))
                {
                    Console.WriteLine("Enter compress mode: [gamma|varbyte]");
                    var modeStr = Console.ReadLine();

                    // The prompt advertises lowercase names, so parse case-insensitively
                    // and reject unknown values instead of throwing.
                    CompressMode mode;
                    if (!Enum.TryParse(modeStr, true, out mode))
                    {
                        Console.WriteLine("Unknown compress mode: " + modeStr);
                        continue;
                    }

                    CompressUtility.Mode = mode;

                    if (command.StartsWith("de", StringComparison.Ordinal))
                    {
                        Decompress();
                    }
                    else
                    {
                        Compress();
                    }
                }
            }
        }