/// <summary>
/// Creates a scorer over the given term index and collection metadata and caches
/// <c>Lavg</c> as <c>TokenCount / CollectionLengthInDocuments</c>.
/// </summary>
/// <param name="index">Term index kept on the scorer.</param>
/// <param name="indexMetadata">Collection metadata kept on the scorer and used to compute <c>Lavg</c>.</param>
public BM25Scorer(TermIndex index, IndexMetadata indexMetadata)
{
    // NOTE(review): if both operands are integral, this division truncates and throws for an
    // empty collection -- confirm the property types and the type of Lavg.
    Lavg = indexMetadata.TokenCount / indexMetadata.CollectionLengthInDocuments;

    this.indexMetadata = indexMetadata;
    this.index = index;
}
/// <summary>
/// Builds the fixture: writes posting lists for "bar" and "foo" into an in-memory term index,
/// writes document info for two documents into an in-memory metadata stream, and loads both
/// into the <c>index</c> and <c>metadata</c> fields.
/// </summary>
public void before()
{
    var termIndexStream = new MemoryStream();
    using (var indexWriter = new FileIndexWriter<string, IList<Posting>>(
        new StringEncoder(),
        new PostingListEncoder(),
        termIndexStream))
    {
        // "bar" has postings (0, 1) and (1, 2).
        this.postingsWithBar = new List<Posting>
        {
            new Posting(0, 1),
            new Posting(1, 2)
        };
        indexWriter.Add("bar", postingsWithBar);

        // "foo" has the single posting (0, 4).
        this.postingsWithFoo = new List<Posting>
        {
            new Posting(0, 4)
        };
        indexWriter.Add("foo", postingsWithFoo);

        indexWriter.WriteOut();
    }

    this.index = new TermIndex(termIndexStream);

    var metadataStream = new MemoryStream();
    using (var metadataWriter = new CollectionMetadataWriter(metadataStream))
    {
        var indexPage = new DocumentInfo("http://www.example.com/index.html", "Example", 100, "", null);
        metadataWriter.AddDocumentInfo(0, indexPage);

        var menuPage = new DocumentInfo("http://www.example.com/menu.html", "Example", 300, "", null);
        metadataWriter.AddDocumentInfo(1, menuPage);

        metadataWriter.WriteOut();
    }

    this.metadata = new IndexMetadata(metadataStream);
}
/// <summary>
/// Forwards to <c>IndexMetadata.AddIndexExplicit</c>, passing <c>false</c> as the first
/// argument and an empty string as the last one.
/// </summary>
/// <param name="indexName">Index name handed to <c>AddIndexExplicit</c>.</param>
/// <param name="indexModuleName">Module name handed to <c>AddIndexExplicit</c>.</param>
/// <param name="imk">Index multi-key handed to <c>AddIndexExplicit</c>.</param>
/// <param name="optionalQueryPlanIndexItem">Query-plan index item handed to <c>AddIndexExplicit</c>.</param>
public void AddIndex(
    string indexName,
    string indexModuleName,
    IndexMultiKey imk,
    QueryPlanIndexItem optionalQueryPlanIndexItem)
    => IndexMetadata.AddIndexExplicit(
        false,
        imk,
        indexName,
        indexModuleName,
        optionalQueryPlanIndexItem,
        "");
/// <summary>
/// Resolves the SQL definition named by the <c>SQLServerMetadataAttribute</c> on
/// <c>IndexMetadata</c> and returns the result of running it through <c>FromSql</c>.
/// </summary>
/// <returns>The <c>FromSql</c> result exposed as an <see cref="IQueryable{T}"/>.</returns>
public IQueryable<IndexMetadata> GetIndexMetadata()
{
    // NOTE(review): this dereferences the attribute without a null check -- presumably the
    // attribute is always present on IndexMetadata; confirm.
    var metadataAttribute = typeof(IndexMetadata).GetCustomAttribute<SQLServerMetadataAttribute>();
    var resourceName = metadataAttribute.SQLDefinitionResource;
    var sqlDefinition = getSQLDefinition(resourceName);

    var rows = IndexMetadata.FromSql(sqlDefinition);
    return rows.AsQueryable();
}
/// <summary>
/// Command-line entry point: indexes every document under a folder, writes the merged index
/// and the collection metadata to the given files, then starts the interactive query prompt.
/// </summary>
/// <param name="args">
/// Exactly three values: the corpus folder path, the destination index file path and the
/// metadata file path. Any other count prints the usage text and returns.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath>");
        // Wait for a key press so the usage text stays visible before returning.
        Console.ReadLine();
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];
    string metadataFilePath = args[2];

    Console.WriteLine("Welcome to Spimi!");

    DirectoryInfo directoryInfo = new DirectoryInfo(directory);
    if (!directoryInfo.Exists)
    {
        Console.WriteLine("Directory could not be found");
        return;
    }

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
    {
        // 1- Index the corpus
        Console.WriteLine("Parsing corpus and creating index blocks...");
        SpimiIndexer indexer = new SpimiIndexer(
            new BasicLexer(),
            new HtmlParser(),
            indexFileStream,
            metadataFileStream);

        WebCrawler crawler = new WebCrawler(directoryInfo);
        foreach (WebDocument doc in crawler.GetDocuments())
        {
            // 'using' releases the document stream even when indexing throws.
            using (Stream stream = doc.Open())
            {
                indexer.Index(doc.Uri, stream);
            }
        }

        // 2- Build the final index
        Console.WriteLine("Merging blocks into one index...");
        indexer.WriteOut();

        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);

        // 3- Query the index
        Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");
        QueryCli cli = new QueryCli(indexMetadata, index);
        cli.Run();
    }
}
/// <summary>
/// Creates an <see cref="IndexMetadata"/> from the given name and key names, adds it to
/// <c>Indicies</c> under <paramref name="name"/>, and returns it.
/// </summary>
/// <param name="name">Assigned to the index's <c>Name</c> and used as the key passed to <c>Indicies.Add</c>.</param>
/// <param name="pk">Assigned to the index's <c>PartitionKeyName</c>.</param>
/// <param name="sk">Assigned to the index's <c>SortKeyName</c>.</param>
/// <returns>The newly created index metadata.</returns>
public IndexMetadata AddIndex(string name, string pk, string sk)
{
    IndexMetadata created = new IndexMetadata
    {
        Name = name,
        PartitionKeyName = pk,
        SortKeyName = sk
    };

    Indicies.Add(name, created);

    return created;
}
/// <summary>
/// Creates a new <see cref="NamedWindowMetaData"/> from this instance's current values.
/// Every value is passed through as-is except <c>IndexMetadata</c>, which is passed as the
/// result of <c>IndexMetadata.Copy()</c>.
/// </summary>
/// <returns>The newly constructed metadata instance.</returns>
public NamedWindowMetaData Copy()
{
    var duplicate = new NamedWindowMetaData(
        EventType,
        NamedWindowModuleName,
        ContextName,
        Uniqueness,
        IsChildBatching,
        IsEnableIndexShare,
        OptionalEventTypeAs,
        IsVirtualDataWindow,
        IndexMetadata.Copy());

    return duplicate;
}
/// <summary>
/// Builds one <see cref="HighlightObject"/> for every word in <c>input.ListOfWords</c> whose
/// trimmed text equals <paramref name="keyword"/>, ignoring case.
/// </summary>
/// <param name="input">Metadata whose <c>ListOfWords</c> is scanned; also stored on each result.</param>
/// <param name="keyword">Keyword to look for; it is compared untrimmed.</param>
/// <returns>
/// The highlight objects in the order the matching words appear in <c>ListOfWords</c>;
/// empty when nothing matches.
/// </returns>
private static List<HighlightObject> HightlightWords(IndexMetadata input, string keyword)
{
    Debug.WriteLine(JsonConvert.SerializeObject(input.ListOfWords));

    List<HighlightObject> highlights = new List<HighlightObject>();

    foreach (var word in input.ListOfWords)
    {
        // A word without text cannot match any keyword.
        if (word.Text == null)
        {
            continue;
        }

        // Ordinal, case-insensitive comparison: independent of the current culture and
        // allocates no lower-cased copies per word.
        if (!string.Equals(keyword, word.Text.Trim(), System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        highlights.Add(new HighlightObject
        {
            Metadata = input,
            HighlightedWords = ConvertWord2BoundingBox(word),
            Keyword = keyword,
            PageNumber = word.page
        });
    }

    return highlights;
}
/// <summary>
/// Creates a new <see cref="TableMetaData"/> from this instance's current values.
/// Every value is passed through as-is except <c>IndexMetadata</c>, which is passed as the
/// result of <c>IndexMetadata.Copy()</c>.
/// </summary>
/// <returns>The newly constructed metadata instance.</returns>
public TableMetaData Copy()
{
    var duplicate = new TableMetaData(
        TableName,
        TableModuleName,
        TableVisibility,
        OptionalContextName,
        OptionalContextVisibility,
        OptionalContextModule,
        InternalEventType,
        PublicEventType,
        KeyColumns,
        KeyTypes,
        KeyColNums,
        Columns,
        NumMethodAggs,
        KeyIndexMultiKey,
        IndexMetadata.Copy());

    return duplicate;
}
//TODO: if there is more than one word in the same cutImage, create only one object
/// <summary>
/// Produces one <see cref="SampleObject"/> per highlight returned by
/// <c>HightlightWords(result, keyword)</c>, in the same order.
/// </summary>
/// <param name="result">Metadata to search; also stored on each sample object.</param>
/// <param name="keyword">Keyword passed to <c>HightlightWords</c>.</param>
/// <returns>The sample objects, each carrying its highlight, the metadata and a page image URI.</returns>
private static List<SampleObject> CreateSampleObject(IndexMetadata result, string keyword)
{
    // Get all highlighted objects.
    List<HighlightObject> highlights = HightlightWords(result, keyword);

    // Map every highlight to a sample object; the image URI is derived from the
    // highlight's own metadata and page number.
    return highlights.ConvertAll(highlight => new SampleObject
    {
        HighlightObject = highlight,
        Metadata = result,
        ImageUri = GetPageImageUri(highlight.Metadata.PDFURI, highlight.PageNumber)
    });
}
/// <summary>
/// Opens the index and metadata files, clusters all document ids into <paramref name="k"/>
/// groups, and describes each group by its document URIs and up to six top terms.
/// </summary>
/// <param name="k">Cluster count passed to <c>KMeansClusterFinder.Cluster</c>.</param>
/// <returns>One <see cref="ClusterResult"/> per cluster returned by the cluster finder.</returns>
public IList<ClusterResult> Cluster(int k)
{
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
    {
        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);
        KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);

        IList<long> allDocIds = indexMetadata.GetDocumentIds();
        long[][] clusters = clusterFinder.Cluster(allDocIds, k);

        IList<ClusterResult> clusterResults = new List<ClusterResult>();
        foreach (long[] cluster in clusters)
        {
            // Materialize once: the documents and their term vectors are each read several
            // times below, and re-enumerating the source would repeat the lookup.
            List<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();
            List<TermVector> termVectors = clusterDocuments.Select(d => d.TermVector).ToList();

            // Sum of the cluster's term vectors, used to weight candidate terms.
            TermVector sum = new TermVector();
            foreach (TermVector vector in termVectors)
            {
                sum += vector;
            }

            // Rank the centroid's non-zero terms by (summed weight * idf) and keep the best six.
            List<string> topTerms = TermVector.GetCentroid(termVectors)
                .GetNonZeroDimensions()
                .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                .Take(6)
                .ToList();

            clusterResults.Add(new ClusterResult(
                topTerms,
                clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
        }

        return clusterResults;
    }
}
/// <summary>
/// Adds the index multi-key for the implicit primary-key index. Does nothing when
/// <c>KeyColumns</c> is null or empty; otherwise builds <c>KeyIndexMultiKey</c> from the key
/// columns and types and registers it through <c>IndexMetadata.AddIndexExplicit</c>.
/// </summary>
/// <exception cref="EPException">
/// Thrown when <c>AddIndexExplicit</c> raises an <c>ExprValidationException</c>.
/// </exception>
public void Init()
{
    var hasKeyColumns = KeyColumns != null && KeyColumns.Length != 0;
    if (!hasKeyColumns)
    {
        return;
    }

    // One property descriptor per key column, pairing the column with its type.
    var keyProps = new IndexedPropDesc[KeyColumns.Length];
    var position = 0;
    while (position < keyProps.Length)
    {
        keyProps[position] = new IndexedPropDesc(KeyColumns[position], KeyTypes[position]);
        position++;
    }

    KeyIndexMultiKey = new IndexMultiKey(true, keyProps, new IndexedPropDesc[0], null);

    try
    {
        IndexMetadata.AddIndexExplicit(true, KeyIndexMultiKey, TableName, TableModuleName, null, "");
    }
    catch (ExprValidationException e)
    {
        var message = "Failed to add primary key index: " + e.Message;
        throw new EPException(message, e);
    }
}
/// <summary>
/// Creates a scorer that keeps the given term index and collection metadata.
/// </summary>
/// <param name="index">Term index stored on the scorer.</param>
/// <param name="indexMetadata">Collection metadata stored on the scorer.</param>
public TfIdfScorer(TermIndex index, IndexMetadata indexMetadata)
{
    this.indexMetadata = indexMetadata;
    this.index = index;
}
/// <summary>
/// Creates a ranker that keeps the given term index and collection metadata and scores
/// with a <see cref="BM25Scorer"/> built from the same two arguments.
/// </summary>
/// <param name="index">Term index stored on the ranker and passed to the scorer.</param>
/// <param name="indexMetadata">Collection metadata stored on the ranker and passed to the scorer.</param>
public BestMatchRanker(TermIndex index, IndexMetadata indexMetadata)
{
    scorer = new BM25Scorer(index, indexMetadata);

    this.indexMetadata = indexMetadata;
    this.index = index;
}
/// <summary>
/// Indexes every document under <c>directory + site</c>, writing the index and the
/// collection metadata to the configured files, and reports collection size and duration.
/// </summary>
/// <param name="site">Path fragment appended to <c>directory</c> to locate the corpus folder.</param>
/// <returns>
/// Stats with <c>CollectionSize</c> and <c>IndexingTime</c> (milliseconds) filled in, or a
/// freshly constructed <see cref="IndexingStats"/> when the folder does not exist.
/// </returns>
public IndexingStats Index(string site)
{
    IndexingStats result = new IndexingStats();

    DirectoryInfo directoryInfo = new DirectoryInfo(directory + site);
    if (!directoryInfo.Exists)
    {
        return result;
    }

    // Stopwatch measures elapsed time without being affected by wall-clock adjustments.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
    {
        // Index the corpus
        SpimiIndexer indexer = new SpimiIndexer(
            new BasicLexer(),
            new HtmlParser(),
            indexFileStream,
            metadataFileStream);

        WebCrawler crawler = new WebCrawler(directoryInfo);
        foreach (WebDocument doc in crawler.GetDocuments())
        {
            // 'using' releases the document stream even when indexing throws.
            using (Stream stream = doc.Open())
            {
                indexer.Index(doc.Uri, stream);
            }
        }

        indexer.WriteOut();

        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
    }

    stopwatch.Stop();
    result.IndexingTime = stopwatch.Elapsed.TotalMilliseconds;

    return result;
}
/// <summary>
/// Computes <c>Math.Log(CollectionLengthInDocuments / index[term].Count)</c> using
/// floating-point division.
/// </summary>
/// <param name="index">Term index; <c>index[term].Count</c> is the divisor.</param>
/// <param name="metadata">Metadata supplying <c>CollectionLengthInDocuments</c>.</param>
/// <param name="term">Term looked up in <paramref name="index"/>.</param>
/// <returns>The natural logarithm of the ratio.</returns>
private double GetIdf(TermIndex index, IndexMetadata metadata, string term)
{
    // Widen the collection size first so the division is not integral.
    double collectionSize = (double)metadata.CollectionLengthInDocuments;

    return Math.Log(collectionSize / index[term].Count);
}
/// <summary>
/// Opens the index and metadata files, runs the lower-cased query through a
/// <see cref="QueryEngine"/>, and returns title, URI and score for at most the first 500
/// resulting document ids.
/// </summary>
/// <param name="query">Query text; it is lower-cased before being passed to the engine.</param>
/// <param name="rankingMode">Ranking mode passed to the engine.</param>
/// <returns>
/// Results in the order the engine returned the ids. Ids with no document info in the
/// metadata are reported on the console and skipped.
/// </returns>
public IList<QueryResult> Query(string query, RankingMode rankingMode)
{
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
    {
        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);
        QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

        IList<long> results = queryEngine.Query(query.ToLower(), rankingMode);
        IList<QueryResult> queryResults = new List<QueryResult>();

        Console.WriteLine("rank\tscore\ttitle");
        foreach (long docId in results.Take(500))
        {
            DocumentInfo docInfo;
            if (!indexMetadata.TryGetDocumentInfo(docId, out docInfo))
            {
                Console.WriteLine("Found document id in posting list that wasn't indexed in metadata: " + docId);
                continue;
            }

            queryResults.Add(new QueryResult()
            {
                Title = docInfo.Title,
                Uri = docInfo.Uri,
                Score = queryEngine.Scores[docId]
            });
        }

        return queryResults;
    }
}
/// <summary>
/// Creates a cluster finder that keeps the given collection metadata and term index.
/// </summary>
/// <param name="metadata">Collection metadata stored on the finder.</param>
/// <param name="index">Term index stored on the finder.</param>
public KMeansClusterFinder(IndexMetadata metadata, TermIndex index)
{
    this.index = index;
    this.metadata = metadata;
}
/// <summary>
/// Creates the command-line front end: keeps the given metadata and term index and builds
/// a <see cref="QueryEngine"/> from them.
/// </summary>
/// <param name="metadata">Collection metadata stored on the CLI and passed to the query engine.</param>
/// <param name="index">Term index stored on the CLI and passed to the query engine.</param>
public QueryCli(IndexMetadata metadata, TermIndex index)
{
    queryEngine = new QueryEngine(index, metadata);

    this.index = index;
    this.metadata = metadata;
}
/// <summary>
/// Creates a ranker that keeps the given term index and collection metadata and scores
/// with a <see cref="TfIdfScorer"/> built from the same two arguments.
/// </summary>
/// <param name="index">Term index stored on the ranker and passed to the scorer.</param>
/// <param name="indexMetadata">Collection metadata stored on the ranker and passed to the scorer.</param>
public TfIdfRanker(TermIndex index, IndexMetadata indexMetadata)
{
    scorer = new TfIdfScorer(index, indexMetadata);

    this.indexMetadata = indexMetadata;
    this.index = index;
}
/// <summary>
/// Creates a reader that keeps the given directory path, parser and collection metadata.
/// </summary>
/// <param name="directory">Directory path stored on the reader.</param>
/// <param name="parser">Parser stored on the reader.</param>
/// <param name="metadata">Collection metadata stored on the reader.</param>
public ReutersReader(string directory, ReutersParser parser, IndexMetadata metadata)
{
    this.metadata = metadata;
    this.parser = parser;
    this.directory = directory;
}