Example #1
0
        /// <summary>
        /// Creates a scorer over the given term index and collection metadata and
        /// precomputes <c>Lavg</c>, the number of tokens per document in the collection.
        /// </summary>
        /// <param name="index">Term index stored on the scorer.</param>
        /// <param name="indexMetadata">
        /// Collection metadata stored on the scorer; its <c>TokenCount</c> and
        /// <c>CollectionLengthInDocuments</c> are used to compute <c>Lavg</c>.
        /// </param>
        public BM25Scorer(TermIndex index, IndexMetadata indexMetadata)
        {
            this.index = index;
            this.indexMetadata = indexMetadata;

            // Divide in floating point: when both counts are integral the plain quotient is
            // truncated before it is stored (assumes Lavg is a floating-point member - TODO confirm).
            // An empty collection yields 0 rather than a division by zero.
            this.Lavg = indexMetadata.CollectionLengthInDocuments == 0
                ? 0.0
                : (double)indexMetadata.TokenCount / indexMetadata.CollectionLengthInDocuments;
        }
Example #2
0
        /// <summary>
        /// Builds the in-memory fixtures: a term index holding postings for "bar" and "foo",
        /// and collection metadata holding document entries 0 and 1.
        /// </summary>
        public void before()
        {
            // Term index: written through a FileIndexWriter into a MemoryStream, which is then handed to TermIndex.
            var termIndexStream = new MemoryStream();
            using (var indexWriter = new FileIndexWriter<string, IList<Posting>>(
                new StringEncoder(), new PostingListEncoder(), termIndexStream))
            {
                this.postingsWithBar = new List<Posting>
                {
                    new Posting(0, 1),
                    new Posting(1, 2)
                };
                indexWriter.Add("bar", this.postingsWithBar);

                this.postingsWithFoo = new List<Posting> { new Posting(0, 4) };
                indexWriter.Add("foo", this.postingsWithFoo);

                indexWriter.WriteOut();
            }
            this.index = new TermIndex(termIndexStream);

            // Collection metadata: same pattern, written into its own MemoryStream.
            var metadataStream = new MemoryStream();
            using (var metadataWriter = new CollectionMetadataWriter(metadataStream))
            {
                var indexPage = new DocumentInfo("http://www.example.com/index.html", "Example", 100, "", null);
                metadataWriter.AddDocumentInfo(0, indexPage);

                var menuPage = new DocumentInfo("http://www.example.com/menu.html", "Example", 300, "", null);
                metadataWriter.AddDocumentInfo(1, menuPage);

                metadataWriter.WriteOut();
            }

            this.metadata = new IndexMetadata(metadataStream);
        }
Example #3
0
 /// <summary>
 /// Adds an index by delegating to <c>IndexMetadata.AddIndexExplicit</c>.
 /// </summary>
 /// <param name="indexName">Index name, forwarded unchanged.</param>
 /// <param name="indexModuleName">Index module name, forwarded unchanged.</param>
 /// <param name="imk">Index multi-key, forwarded unchanged.</param>
 /// <param name="optionalQueryPlanIndexItem">Query plan index item, forwarded unchanged.</param>
 public void AddIndex(
     string indexName,
     string indexModuleName,
     IndexMultiKey imk,
     QueryPlanIndexItem optionalQueryPlanIndexItem)
 {
     // The first argument (false) and the last argument (empty string) are fixed by this method;
     // everything in between comes straight from the caller.
     IndexMetadata.AddIndexExplicit(
         false,
         imk,
         indexName,
         indexModuleName,
         optionalQueryPlanIndexItem,
         "");
 }
Example #4
0
        /// <summary>
        /// Queries <c>IndexMetadata</c> using the SQL definition named by the
        /// <c>SQLServerMetadataAttribute</c> applied to the <c>IndexMetadata</c> type.
        /// </summary>
        /// <returns>The result of <c>FromSql</c> over the loaded definition, exposed as a queryable.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// The <c>IndexMetadata</c> type carries no <c>SQLServerMetadataAttribute</c>.
        /// </exception>
        public IQueryable <IndexMetadata> GetIndexMetadata()
        {
            // GetCustomAttribute returns null when the attribute is absent; fail with a clear
            // message instead of a NullReferenceException on the property access below.
            var metadataAttribute = typeof(IndexMetadata).GetCustomAttribute <SQLServerMetadataAttribute>();
            if (metadataAttribute == null)
            {
                throw new System.InvalidOperationException(
                    "IndexMetadata is not annotated with SQLServerMetadataAttribute; no SQL definition resource is available.");
            }

            var sqlDefinition = getSQLDefinition(metadataAttribute.SQLDefinitionResource);

            return IndexMetadata
                   .FromSql(sqlDefinition)
                   .AsQueryable();
        }
Example #5
0
        /// <summary>
        /// Command-line entry point: indexes the documents found under a folder, writes the
        /// index and metadata files, then starts the interactive query CLI over them.
        /// </summary>
        /// <param name="args">
        /// Exactly three values: the corpus folder path, the destination index file path and
        /// the metadata file path. Any other count prints the usage line and exits.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath>");
                // Wait for a key press so the usage line stays visible before exiting.
                Console.ReadLine();
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            string metadataFilePath = args[2];

            Console.WriteLine("Welcome to Spimi!");

            DirectoryInfo directoryInfo = new DirectoryInfo(directory);
            if (!directoryInfo.Exists)
            {
                Console.WriteLine("Directory could not be found");
                return;
            }

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
            {
                using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
                {
                    // 1- Index the corpus
                    Console.WriteLine("Parsing corpus and creating index blocks...");
                    SpimiIndexer indexer = new SpimiIndexer(
                        new BasicLexer(),
                        new HtmlParser(),
                        indexFileStream,
                        metadataFileStream);

                    WebCrawler crawler = new WebCrawler(directoryInfo);
                    foreach (WebDocument doc in crawler.GetDocuments())
                    {
                        // 'using' releases the document stream even when indexing throws.
                        using (Stream stream = doc.Open())
                        {
                            indexer.Index(doc.Uri, stream);
                        }
                    }

                    // 2- Build the final index
                    Console.WriteLine("Merging blocks into one index...");
                    indexer.WriteOut();

                    IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                    TermIndex index = new TermIndex(indexFileStream);

                    // 3- Query the index
                    Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");

                    QueryCli cli = new QueryCli(indexMetadata, index);
                    cli.Run();
                }
            }
        }
Example #6
0
        /// <summary>
        /// Creates an index description and adds it to <c>Indicies</c> under its name.
        /// </summary>
        /// <param name="name">Index name; also passed as the first argument to <c>Indicies.Add</c>.</param>
        /// <param name="pk">Value assigned to the new index's <c>PartitionKeyName</c>.</param>
        /// <param name="sk">Value assigned to the new index's <c>SortKeyName</c>.</param>
        /// <returns>The newly created <see cref="IndexMetadata"/>.</returns>
        public IndexMetadata AddIndex(string name, string pk, string sk)
        {
            var created = new IndexMetadata();
            created.Name = name;
            created.PartitionKeyName = pk;
            created.SortKeyName = sk;

            Indicies.Add(name, created);
            return created;
        }
Example #7
0
 /// <summary>
 /// Creates a new <c>NamedWindowMetaData</c> from this instance's current values.
 /// </summary>
 /// <remarks>
 /// Every value is passed to the constructor as-is except <c>IndexMetadata</c>,
 /// for which <c>Copy()</c> is called first.
 /// </remarks>
 /// <returns>The newly constructed instance.</returns>
 public NamedWindowMetaData Copy()
 {
     var duplicate = new NamedWindowMetaData(
         EventType,
         NamedWindowModuleName,
         ContextName,
         Uniqueness,
         IsChildBatching,
         IsEnableIndexShare,
         OptionalEventTypeAs,
         IsVirtualDataWindow,
         IndexMetadata.Copy());

     return duplicate;
 }
Example #8
0
        /// <summary>
        /// Builds one <c>HighlightObject</c> for every word in <c>input.ListOfWords</c> whose
        /// text matches <paramref name="keyword"/>.
        /// </summary>
        /// <remarks>
        /// A word matches when its lower-cased, trimmed text equals the lower-cased keyword.
        /// Results keep the order in which the matching words appear in <c>ListOfWords</c>.
        /// </remarks>
        /// <param name="input">Metadata whose <c>ListOfWords</c> is scanned; also stored on every result.</param>
        /// <param name="keyword">Keyword to look for; a null keyword matches nothing.</param>
        /// <returns>The highlight objects, possibly empty.</returns>
        private static List <HighlightObject> HightlightWords(IndexMetadata input, string keyword)
        {
            List <HighlightObject> list = new List <HighlightObject>();

            Debug.WriteLine(JsonConvert.SerializeObject(input.ListOfWords));

            if (keyword == null)
            {
                return(list);
            }

            // Loop-invariant: lower-case the keyword once instead of once per word.
            string loweredKeyword = keyword.ToLower();

            // The previous implementation bucketed matches per page and then flattened the
            // buckets again in the same order, so appending directly gives the same sequence.
            foreach (var word in input.ListOfWords)
            {
                if (loweredKeyword != word.Text.ToLower().Trim())
                {
                    continue;
                }

                list.Add(new HighlightObject
                {
                    Metadata         = input,
                    HighlightedWords = ConvertWord2BoundingBox(word),
                    Keyword          = keyword,
                    PageNumber       = word.page
                });
            }

            return(list);
        }
Example #9
0
 /// <summary>
 /// Creates a new <c>TableMetaData</c> from this instance's current values.
 /// </summary>
 /// <remarks>
 /// Every value is passed to the constructor as-is except <c>IndexMetadata</c>,
 /// for which <c>Copy()</c> is called first.
 /// </remarks>
 /// <returns>The newly constructed instance.</returns>
 public TableMetaData Copy()
 {
     var duplicate = new TableMetaData(
         TableName,
         TableModuleName,
         TableVisibility,
         OptionalContextName,
         OptionalContextVisibility,
         OptionalContextModule,
         InternalEventType,
         PublicEventType,
         KeyColumns,
         KeyTypes,
         KeyColNums,
         Columns,
         NumMethodAggs,
         KeyIndexMultiKey,
         IndexMetadata.Copy());

     return duplicate;
 }
Example #10
0
        //TODO: if there is more than one word in the same cutImage, create only one object
        /// <summary>
        /// Creates one <c>SampleObject</c> per highlight that <c>HightlightWords</c> returns
        /// for <paramref name="result"/> and <paramref name="keyword"/>.
        /// </summary>
        /// <param name="result">Metadata passed to <c>HightlightWords</c> and stored on every sample.</param>
        /// <param name="keyword">Keyword passed to <c>HightlightWords</c>.</param>
        /// <returns>The samples, in the order the highlights were returned.</returns>
        private static List <SampleObject> CreateSampleObject(IndexMetadata result, string keyword)
        {
            var samples = new List <SampleObject>();

            foreach (var highlight in HightlightWords(result, keyword))
            {
                var sample = new SampleObject();
                sample.HighlightObject = highlight;
                sample.Metadata        = result;
                // The page image is looked up from the highlight's own metadata and page number.
                sample.ImageUri        = GetPageImageUri(highlight.Metadata.PDFURI, highlight.PageNumber);

                samples.Add(sample);
            }

            return(samples);
        }
Example #11
0
    /// <summary>
    /// Opens the index and metadata files, clusters all document ids into
    /// <paramref name="k"/> groups and describes each group by its top terms and document URIs.
    /// </summary>
    /// <param name="k">Cluster count passed to <c>KMeansClusterFinder.Cluster</c>.</param>
    /// <returns>One <c>ClusterResult</c> per cluster returned by the cluster finder.</returns>
    public IList<ClusterResult> Cluster(int k)
    {
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
            {
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                TermIndex index = new TermIndex(indexFileStream);

                KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
                IList<long> allDocIds = indexMetadata.GetDocumentIds();
                long[][] clusters = clusterFinder.Cluster(allDocIds, k);

                IList<ClusterResult> clusterResults = new List<ClusterResult>();

                foreach (long[] cluster in clusters)
                {
                    // Fetch the cluster's documents once; they are walked three times below
                    // (term-frequency sum, centroid, URI list).
                    List<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();

                    // Sum the documents' term vectors.
                    TermVector sum = new TermVector();
                    foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
                    {
                        sum += vector;
                    }

                    // Rank the centroid's non-zero terms by (summed dimension length * GetIdf) and keep the top 6.
                    List<string> topTerms =
                        TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                        .GetNonZeroDimensions()
                        .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                        .Take(6)
                        .ToList();

                    clusterResults.Add(new ClusterResult(topTerms,
                        clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
                }

                return clusterResults;
            }
        }
    }
Example #12
0
        /// <summary>
        /// Builds <c>KeyIndexMultiKey</c> from the key columns and adds it to
        /// <c>IndexMetadata</c> as the implicit primary-key index. Does nothing when
        /// <c>KeyColumns</c> is null or empty.
        /// </summary>
        /// <exception cref="EPException">
        /// Wraps the <c>ExprValidationException</c> thrown by <c>AddIndexExplicit</c>.
        /// </exception>
        public void Init()
        {
            // Without key columns there is no primary-key index to add.
            var hasKeyColumns = KeyColumns != null && KeyColumns.Length > 0;
            if (!hasKeyColumns)
            {
                return;
            }

            // Pair each key column with the key type at the same position.
            var keyProps = new IndexedPropDesc[KeyColumns.Length];
            var position = 0;
            while (position < keyProps.Length)
            {
                keyProps[position] = new IndexedPropDesc(KeyColumns[position], KeyTypes[position]);
                position++;
            }

            KeyIndexMultiKey = new IndexMultiKey(true, keyProps, new IndexedPropDesc[0], null);

            try
            {
                IndexMetadata.AddIndexExplicit(
                    true,
                    KeyIndexMultiKey,
                    TableName,
                    TableModuleName,
                    null,
                    "");
            }
            catch (ExprValidationException validationFailure)
            {
                throw new EPException("Failed to add primary key index: " + validationFailure.Message, validationFailure);
            }
        }
Example #13
0
 /// <summary>
 /// Creates a scorer that keeps the given term index and collection metadata.
 /// </summary>
 /// <param name="index">Term index stored on the instance.</param>
 /// <param name="indexMetadata">Collection metadata stored on the instance.</param>
 public TfIdfScorer(TermIndex index, IndexMetadata indexMetadata)
 {
     this.indexMetadata = indexMetadata;
     this.index = index;
 }
Example #14
0
 /// <summary>
 /// Creates a ranker that keeps the given term index and collection metadata and
 /// scores with a <c>BM25Scorer</c> built from the same two objects.
 /// </summary>
 /// <param name="index">Term index stored on the instance and passed to the scorer.</param>
 /// <param name="indexMetadata">Collection metadata stored on the instance and passed to the scorer.</param>
 public BestMatchRanker(TermIndex index, IndexMetadata indexMetadata)
 {
     this.indexMetadata = indexMetadata;
     this.index = index;

     this.scorer = new BM25Scorer(index, indexMetadata);
 }
Example #15
0
    /// <summary>
    /// Indexes the documents found under <c>directory + site</c>, writes the index and
    /// metadata files, and reports the collection size and elapsed time.
    /// </summary>
    /// <param name="site">Path fragment appended to <c>directory</c> to locate the corpus folder.</param>
    /// <returns>
    /// The indexing statistics. When the folder does not exist, a freshly constructed
    /// <c>IndexingStats</c> is returned and no files are written.
    /// </returns>
    public IndexingStats Index(string site)
    {
        IndexingStats result = new IndexingStats();

        DirectoryInfo directoryInfo = new DirectoryInfo(directory + site);
        if (!directoryInfo.Exists)
        {
            return result;
        }

        // Stopwatch measures elapsed time independently of wall-clock adjustments
        // (DST switches, clock sync), which DateTime.Now differences are exposed to.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
            {
                // Index the corpus
                SpimiIndexer indexer = new SpimiIndexer(
                    new BasicLexer(),
                    new HtmlParser(),
                    indexFileStream,
                    metadataFileStream);

                WebCrawler crawler = new WebCrawler(directoryInfo);
                foreach (WebDocument doc in crawler.GetDocuments())
                {
                    // 'using' releases the document stream even when indexing throws.
                    using (Stream stream = doc.Open())
                    {
                        indexer.Index(doc.Uri, stream);
                    }
                }

                indexer.WriteOut();
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
            }
        }
        timer.Stop();
        result.IndexingTime = timer.Elapsed.TotalMilliseconds;
        return result;
    }
Example #16
0
 /// <summary>
 /// Returns the natural logarithm of the collection's document count divided by
 /// the <c>Count</c> of <c>index[term]</c>.
 /// </summary>
 /// <param name="index">Term index looked up with <paramref name="term"/>.</param>
 /// <param name="metadata">Metadata supplying <c>CollectionLengthInDocuments</c>.</param>
 /// <param name="term">Term to look up.</param>
 /// <returns>The computed logarithm.</returns>
 private double GetIdf(TermIndex index, IndexMetadata metadata, string term)
 {
     // Convert to double first so the division below is a floating-point division.
     double collectionSize = (double)metadata.CollectionLengthInDocuments;
     double ratio = collectionSize / index[term].Count;
     return Math.Log(ratio);
 }
Example #17
0
    /// <summary>
    /// Opens the index and metadata files, runs the lower-cased query through a
    /// <c>QueryEngine</c> and returns title, URI and score for the leading results.
    /// </summary>
    /// <param name="query">Query text; lower-cased before it is passed to the engine.</param>
    /// <param name="rankingMode">Ranking mode passed to the engine.</param>
    /// <returns>
    /// At most 500 results, in the order the engine returned the document ids. Ids with no
    /// entry in the metadata are skipped and reported on the console.
    /// </returns>
    public IList<QueryResult> Query(string query, RankingMode rankingMode)
    {
        // Upper bound on the number of document ids turned into results.
        const int MaxResults = 500;

        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
            {
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                TermIndex index = new TermIndex(indexFileStream);
                QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                IList<long> results = queryEngine.Query(query.ToLower(), rankingMode);
                IList<QueryResult> queryResults = new List<QueryResult>();

                // NOTE(review): only this header is written; no result rows are printed - confirm whether it is still wanted.
                Console.WriteLine("rank\tscore\ttitle");
                foreach (long docId in results.Take(MaxResults))
                {
                    DocumentInfo docInfo;
                    if (indexMetadata.TryGetDocumentInfo(docId, out docInfo))
                    {
                        QueryResult res = new QueryResult()
                        {
                            Title = docInfo.Title,
                            Uri = docInfo.Uri,
                            Score = queryEngine.Scores[docId]
                        };
                        queryResults.Add(res);
                    }
                    else
                    {
                        Console.WriteLine("Found document id in posting list that wasn't indexed in metadata: " + docId);
                    }
                }

                return queryResults;
            }
        }
    }
Example #18
0
 /// <summary>
 /// Creates a cluster finder that keeps the given collection metadata and term index.
 /// </summary>
 /// <param name="metadata">Collection metadata stored on the instance.</param>
 /// <param name="index">Term index stored on the instance.</param>
 public KMeansClusterFinder(IndexMetadata metadata, TermIndex index)
 {
     this.index = index;
     this.metadata = metadata;
 }
Example #19
0
 /// <summary>
 /// Creates a CLI that keeps the given collection metadata and term index and
 /// builds its own <c>QueryEngine</c> from them.
 /// </summary>
 /// <param name="metadata">Collection metadata stored on the instance and passed to the engine.</param>
 /// <param name="index">Term index stored on the instance and passed to the engine.</param>
 public QueryCli(IndexMetadata metadata, TermIndex index)
 {
     this.index = index;
     this.metadata = metadata;

     this.queryEngine = new QueryEngine(index, metadata);
 }
Example #20
0
 /// <summary>
 /// Creates a ranker that keeps the given term index and collection metadata and
 /// scores with a <c>TfIdfScorer</c> built from the same two objects.
 /// </summary>
 /// <param name="index">Term index stored on the instance and passed to the scorer.</param>
 /// <param name="indexMetadata">Collection metadata stored on the instance and passed to the scorer.</param>
 public TfIdfRanker(TermIndex index, IndexMetadata indexMetadata)
 {
     this.indexMetadata = indexMetadata;
     this.index = index;

     this.scorer = new TfIdfScorer(index, indexMetadata);
 }
Example #21
0
 /// <summary>
 /// Creates a reader that keeps the given directory, parser and metadata.
 /// </summary>
 /// <param name="directory">Directory path stored on the instance.</param>
 /// <param name="parser">Parser stored on the instance.</param>
 /// <param name="metadata">Metadata stored on the instance.</param>
 public ReutersReader(string directory, ReutersParser parser, IndexMetadata metadata)
 {
     this.metadata = metadata;
     this.parser = parser;
     this.directory = directory;
 }