/// <summary>
/// Builds a <see cref="CompressedIndex"/> from <paramref name="uncompressedIndex"/>.
/// </summary>
/// <remarks>
/// The postings are first copied via <c>CopyDataToCompressedIndex</c>. Every entry of each posting's
/// <c>RawData</c> is then wrapped in a <c>RawDataMapping</c> tagged with its token, the whole batch is
/// passed to <c>CompressUtility.CompressIntList</c> in a single call, and each posting finally receives
/// the <c>CompressedData</c> of its own mappings, ordered by mapping <c>Id</c>.
/// </remarks>
/// <param name="uncompressedIndex">The index whose postings are copied and compressed.</param>
/// <returns>The compressed copy of the index.</returns>
public CompressedIndex Compress(PositionalIndex uncompressedIndex)
{
    CompressedIndex compressedIndex = CopyDataToCompressedIndex(uncompressedIndex);

    var mapping = new List<RawDataMapping>();
    foreach (var tokenPosting in compressedIndex.PostingsByToken)
    {
        mapping.AddRange(tokenPosting.Value.RawData.Select(list => new RawDataMapping(tokenPosting.Key, list)));
    }

    new CompressUtility().CompressIntList(mapping);

    // Group the mappings by token once. Filtering the full list for every posting
    // would cost (number of tokens) x (number of mappings) comparisons.
    var mappingsByToken = mapping.ToLookup(x => x.Token);

    foreach (var tokenPosting in compressedIndex.PostingsByToken)
    {
        // A lookup keeps source order inside each group and OrderBy is stable,
        // so the resulting order matches a filter-then-sort over the flat list.
        var postingCompressedStrings = mappingsByToken[tokenPosting.Key]
            .OrderBy(x => x.Id)
            .Select(x => x.CompressedData)
            .ToList();
        tokenPosting.Value.PlaceCompressedData(postingCompressedStrings);
    }

    return compressedIndex;
}
/// <summary>
/// Rebuilds a <see cref="PositionalIndex"/> from <paramref name="compressedIndex"/>.
/// </summary>
/// <remarks>
/// For every posting, its <c>CompressedListOfDocumentIds</c> and each entry of its <c>OccurrencesList</c>
/// are wrapped in a <c>RawDataMapping</c> tagged with the posting's token. The whole batch is passed to
/// <c>CompressUtility.DecompressList</c> in a single call; each posting is then decompressed from the
/// <c>RawData</c> of its own mappings (ordered by mapping <c>Id</c>) and added to the result.
/// </remarks>
/// <param name="compressedIndex">The compressed index to expand.</param>
/// <returns>A new positional index holding the decompressed postings.</returns>
public PositionalIndex Decompress(CompressedIndex compressedIndex)
{
    var mapping = new List<RawDataMapping>();
    foreach (var compressedPosting in compressedIndex.PostingsByToken)
    {
        mapping.Add(new RawDataMapping(compressedPosting.Key, compressedPosting.Value.CompressedListOfDocumentIds));
        mapping.AddRange(compressedPosting.Value.OccurrencesList.Select(x => new RawDataMapping(compressedPosting.Key, x)));
    }

    new CompressUtility().DecompressList(mapping);

    // Group the mappings by token once. Filtering the full list for every posting
    // would cost (number of tokens) x (number of mappings) comparisons.
    var mappingsByToken = mapping.ToLookup(x => x.Token);

    var result = new PositionalIndex();
    foreach (var tokenPosting in compressedIndex.PostingsByToken)
    {
        // A lookup keeps source order inside each group and OrderBy is stable,
        // so the resulting order matches a filter-then-sort over the flat list.
        var postingIntList = mappingsByToken[tokenPosting.Key]
            .OrderBy(x => x.Id)
            .Select(x => x.RawData)
            .ToList();
        var decompressedPosting = tokenPosting.Value.Decompress(postingIntList);
        result.AddDecompressedPosting(decompressedPosting);
    }

    return result;
}
/// <summary>
/// Creates the TF-IDF vectors of this document from the supplied indexes.
/// </summary>
/// <param name="TitleIndex">Index used for the title vector; when null, <c>TitleVector</c> is left untouched.</param>
/// <param name="ContentIndex">Index used for the content vector, which is always (re)built.</param>
public void CreateVector(PositionalIndex TitleIndex, PositionalIndex ContentIndex)
{
    bool titleIndexAvailable = TitleIndex != null;

    // The title vector is optional and only built when a title index was supplied.
    if (titleIndexAvailable)
    {
        TitleVector = new TfIdfVector(
            TitleIndex,
            GetTokens(TitleTokens),
            Document.Id);
    }

    ContentVector = new TfIdfVector(
        ContentIndex,
        GetTokens(ContentTokens),
        Document.Id);
}
/// <summary>
/// Initializes the vector for document <paramref name="docId"/> over the given <paramref name="tokens"/>.
/// </summary>
/// <remarks>
/// <c>IdfByToken</c> maps each token to <c>index.Idf(token)</c>; <c>TfByToken</c> maps each token to the
/// number of positions in <c>index.GetOccurrence(token, docId)</c>. <c>SetVector</c> is called once both
/// dictionaries are filled. Because the tokens are used as dictionary keys, a repeated token makes
/// <c>ToDictionary</c> throw.
/// </remarks>
/// <param name="index">Index queried for idf values and occurrences.</param>
/// <param name="tokens">Tokens that become the keys of both dictionaries.</param>
/// <param name="docId">Identifier of the document whose occurrences are counted.</param>
public TfIdfVector(PositionalIndex index, List<string> tokens, int docId)
{
    IdfByToken = tokens.ToDictionary(
        token => token,
        token => index.Idf(token));

    TfByToken = tokens.ToDictionary(
        token => token,
        token => index.GetOccurrence(token, docId).Positions.Count);

    SetVector();
}
/// <summary>
/// Creates a new <see cref="CompressedIndex"/> and adds every token/posting pair of
/// <paramref name="uncompressedIndex"/> to it via <c>AddPosting</c>.
/// </summary>
/// <param name="uncompressedIndex">Source index whose <c>PostingsByToken</c> entries are copied.</param>
/// <returns>The newly populated compressed index.</returns>
private CompressedIndex CopyDataToCompressedIndex(PositionalIndex uncompressedIndex)
{
    var compressedCopy = new CompressedIndex();

    // Postings are added one at a time, in the enumeration order of the source.
    foreach (var posting in uncompressedIndex.PostingsByToken)
    {
        compressedCopy.AddPosting(posting.Key, posting.Value);
    }

    return compressedCopy;
}
/// <summary>
/// Prepares a query for execution against <paramref name="index"/>.
/// </summary>
/// <remarks>
/// The raw text is run through a <c>QueryProcessor</c>. <c>Query</c> receives the distinct tokens it
/// produced and <c>QueryTokenCounts</c> records how many times each of those tokens occurs in the
/// processor's token sequence.
/// </remarks>
/// <param name="rawQuery">Query text as entered by the user.</param>
/// <param name="index">Index the query will be executed against.</param>
public QueryExecuter(string rawQuery, PositionalIndex index)
{
    RawQuery = rawQuery;
    Index = index;

    var preprocessor = new QueryProcessor(RawQuery);
    preprocessor.Preprocess();

    Query = preprocessor.Tokens.Distinct().ToList();

    // Query holds distinct tokens, so every key is unique.
    QueryTokenCounts = Query.ToDictionary(
        token => token,
        token => preprocessor.Tokens.Count(candidate => candidate == token));
}
/// <summary>
/// Tokenizes all documents, builds the content index (and the title index when <c>HasTitle</c> is set),
/// stores the tokens on each document and finally creates every document's vectors.
/// </summary>
/// <remarks>
/// Documents are added to the indexes in ascending document id order.
/// </remarks>
public void Process()
{
    if (HasTitle)
    {
        TitleIndex = new PositionalIndex();
    }
    ContentIndex = new PositionalIndex();

    var pureDocs = Documents.Values
        .Select(entry => entry.Document)
        .ToList();

    // Title tokens are only produced when the collection has titles.
    Dictionary<int, DocumentTokens> titleTokensBulk = null;
    if (HasTitle)
    {
        var titlePreprocessClient = new PreprocessClient(true, true);
        titleTokensBulk = titlePreprocessClient
            .GetTokens(pureDocs)
            .ToDictionary(tokens => tokens.DocumentId);
    }

    var contentPreprocessClient = new PreprocessClient(false, true);
    var contentTokensBulk = contentPreprocessClient
        .GetTokens(pureDocs)
        .ToDictionary(tokens => tokens.DocumentId);

    foreach (var docId in contentTokensBulk.Keys.OrderBy(id => id))
    {
        if (HasTitle)
        {
            // Title goes into its index first, then the content, then both are stored on the document.
            var titleTokens = titleTokensBulk[docId];
            TitleIndex.AddDocumentToIndex(titleTokens);
            ContentIndex.AddDocumentToIndex(contentTokensBulk[docId]);
            Documents[docId].SetTokens(titleTokens, contentTokensBulk[docId]);
        }
        else
        {
            ContentIndex.AddDocumentToIndex(contentTokensBulk[docId]);
            Documents[docId].SetTokens(null, contentTokensBulk[docId]);
        }
    }

    // Vectors are created only after every document has been indexed.
    foreach (var document in Documents.Values)
    {
        document.CreateVector(TitleIndex, ContentIndex);
    }
}
/// <summary>
/// Builds the main index from the English files, prints token statistics, writes the index to a file,
/// generates the biword data and then runs an interactive console loop.
/// </summary>
/// <remarks>
/// Supported commands:
/// <list type="bullet">
/// <item><description><c>and</c> / <c>a</c>: AND search.</description></item>
/// <item><description><c>query</c> / <c>q</c>: TF-IDF search.</description></item>
/// <item><description><c>proximity N</c> / <c>p N</c>: TF-IDF search with window size <c>N</c>.</description></item>
/// <item><description><c>info WORD</c>: prints the index information of a word.</description></item>
/// <item><description><c>compress</c> / <c>decompress</c>: asks for a mode and runs the matching operation.</description></item>
/// </list>
/// Malformed input (missing or non-numeric arguments, unknown compress mode) is reported on the console
/// and the loop continues. The loop ends when the input stream is exhausted.
/// </remarks>
void RunPhase1()
{
    MainIndex = new PositionalIndex();
    IndexEnglishFiles();
    //IndexPersianFiles();

    var tokenRepeats = MainIndex.GetAllTokensRepeats()
        .OrderByDescending(x => x.repeatCount)
        .ToList();
    Console.WriteLine(string.Join(",", tokenRepeats));
    Console.WriteLine("Unique Token count is: " + tokenRepeats.Count);

    WriteInFile(MainIndex, "MainIndex");
    GenerateBiword();

    while (true)
    {
        Console.WriteLine("Enter your command: [query|proximity(windowSize)|and|compress|decompress|info(word)]");
        var command = Console.ReadLine();
        if (command == null)
        {
            // ReadLine returns null once the input stream is closed; there is nothing left to process.
            break;
        }

        var commandWords = command.Split(' ');
        bool isAndSearch = command == "and" || command == "a";
        bool isProximitySearch = commandWords.First() == "p" || commandWords.First() == "proximity";

        if (isAndSearch || command == "q" || command == "query" || isProximitySearch)
        {
            Console.WriteLine("Enter your query: ");
            var query = Console.ReadLine();
            if (string.IsNullOrEmpty(query))
            {
                Console.WriteLine("Your query is empty.");
                continue;
            }

            var queryExecuter = new QueryExecuter(query, MainIndex);
            List<SearchResult> result = null;
            if (isAndSearch)
            {
                result = queryExecuter.ExecuteAndSearch();
            }
            else if (isProximitySearch)
            {
                // The window size is the second word of the command; it may be missing or not a number.
                if (commandWords.Length < 2 || !int.TryParse(commandWords[1], out var windowSize))
                {
                    Console.WriteLine("Proximity search needs a numeric window size, e.g. \"proximity 5\".");
                    continue;
                }
                result = queryExecuter.ExecuteTfIdfSearch(windowSize);
            }
            else
            {
                result = queryExecuter.ExecuteTfIdfSearch();
            }

            Console.WriteLine("Query after preprocess: " + JsonConvert.SerializeObject(queryExecuter.Query));
            Console.WriteLine(string.Join(Environment.NewLine, result));
            Console.WriteLine("---------------------------------" + Environment.NewLine);

            Console.WriteLine("Enter classification filter: [1|2|3|4|no filter(Enter)]");
            var classFilterLine = Console.ReadLine();
            if (string.IsNullOrWhiteSpace(classFilterLine))
            {
                // Plain Enter means "no filter": the unfiltered results were already printed above.
                continue;
            }
            if (!byte.TryParse(classFilterLine, out var classFilter))
            {
                Console.WriteLine("Invalid classification filter: " + classFilterLine);
                continue;
            }

            queryExecuter = new QueryExecuter(query, MainIndex);
            result = queryExecuter.ExecuteTfIdfSearch(null, classFilter);
            Console.WriteLine(string.Join(Environment.NewLine, result));
            Console.WriteLine("---------------------------------" + Environment.NewLine);
        }
        else if (command.StartsWith("info"))
        {
            if (commandWords.Length < 2 || string.IsNullOrEmpty(commandWords[1]))
            {
                Console.WriteLine("Info needs a word, e.g. \"info apple\".");
                continue;
            }
            Console.WriteLine(MainIndex.GetWordInfo(commandWords[1]));
        }
        else if (command.Contains("compress"))
        {
            Console.WriteLine("Enter compress mode: [gamma|varbyte]");
            var modeStr = Console.ReadLine();

            // Case-insensitive so the lower-case spellings shown in the prompt are accepted
            // regardless of how the enum members are cased.
            if (!Enum.TryParse<CompressMode>(modeStr, true, out var mode))
            {
                Console.WriteLine("Unknown compress mode: " + modeStr);
                continue;
            }
            CompressUtility.Mode = mode;

            if (command.StartsWith("de"))
            {
                Decompress();
            }
            else
            {
                Compress();
            }
        }
    }
}