Represents a document structured so that it can easily be dumped to disk or to a database.
The class is not thread-safe.
示例#1
0
        /// <summary>
        /// Creates a <see cref="T:FileDocument" /> from previously dumped document data.
        /// </summary>
        /// <param name="doc">The dumped document whose ID, name, title and date/time are copied.</param>
        /// <remarks>The provider is the portion of the document name that precedes the first '|' character
        /// (the whole name when it contains no '|').</remarks>
        public FileDocument(DumpedDocument doc)
        {
            // The name is a '|'-separated string; only its first segment is needed here
            string[] nameParts = doc.Name.Split('|');

            provider = nameParts[0];

            // Plain metadata copied from the dumped document
            dateTime = doc.DateTime;
            title = doc.Title;
            name = doc.Name;
            id = doc.ID;
        }
        /// <summary>
        /// Creates a <see cref="T:PageAttachmentDocument" /> from previously dumped document data.
        /// </summary>
        /// <param name="doc">The dumped document whose ID, name, title and date/time are copied.</param>
        /// <remarks>The document name is split on '|': the first segment becomes the provider, the second
        /// segment is passed to <c>Pages.FindPage</c> to obtain the page.</remarks>
        public PageAttachmentDocument(DumpedDocument doc)
        {
            // Name layout: provider in the first segment, page name in the second
            string[] nameParts = doc.Name.Split('|');

            // Plain metadata copied from the dumped document
            id = doc.ID;
            name = doc.Name;
            title = doc.Title;
            dateTime = doc.DateTime;

            // Data derived from the name segments
            provider = nameParts[0];
            string pageName = nameParts[1];
            page = Pages.FindPage(pageName);
        }
示例#3
0
        /// <summary>
        /// Creates a <see cref="T:PageDocument" /> bound to a page, copying the metadata of a dumped document.
        /// </summary>
        /// <param name="pageInfo">The page the document refers to (stored as-is, it is not checked for <c>null</c>).</param>
        /// <param name="dumpedDocument">The dumped document providing ID, name, type tag, title and date/time.</param>
        /// <param name="tokenizer">The tokenizer stored for this document.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="dumpedDocument"/> or <paramref name="tokenizer"/> are <c>null</c>.</exception>
        public PageDocument(PageInfo pageInfo, DumpedDocument dumpedDocument, Tokenizer tokenizer)
        {
            if(dumpedDocument == null) {
                throw new ArgumentNullException("dumpedDocument");
            }
            if(tokenizer == null) {
                throw new ArgumentNullException("tokenizer");
            }

            // Collaborators
            this.pageInfo = pageInfo;
            this.tokenizer = tokenizer;

            // Metadata copied from the dumped document
            id = dumpedDocument.ID;
            name = dumpedDocument.Name;
            typeTag = dumpedDocument.TypeTag;
            title = dumpedDocument.Title;
            dateTime = dumpedDocument.DateTime;
        }
示例#4
0
        /// <summary>
        /// Creates a <see cref="DumpedChange" /> that groups a dumped document with its words and word mappings.
        /// </summary>
        /// <param name="document">The dumped document data.</param>
        /// <param name="words">The dumped words data.</param>
        /// <param name="mappings">The dumped word mappings data; an empty list is accepted.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        public DumpedChange(DumpedDocument document, List<DumpedWord> words, List<DumpedWordMapping> mappings)
        {
            if(document == null) {
                throw new ArgumentNullException("document");
            }
            if(words == null) {
                throw new ArgumentNullException("words");
            }
            if(mappings == null) {
                throw new ArgumentNullException("mappings");
            }

            // No emptiness check on mappings: a document without any indexable content
            // legitimately produces an empty list.

            this.mappings = mappings;
            this.words = words;
            this.document = document;
        }
示例#5
0
        /// <summary>
        /// Builds a <see cref="DumpedChange" /> out of a dumped document and the words and mappings that belong to it.
        /// </summary>
        /// <param name="document">The dumped document data.</param>
        /// <param name="words">The dumped words data.</param>
        /// <param name="mappings">The dumped word mappings data (may be empty, never <c>null</c>).</param>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        public DumpedChange(DumpedDocument document, List<DumpedWord> words, List<DumpedWordMapping> mappings)
        {
            if(document == null) throw new ArgumentNullException("document");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");

            // An empty mappings list is valid: it occurs when the document has no indexable content,
            // so only null is rejected above.
            this.document = document;
            this.words = words;
            this.mappings = mappings;
        }
        /// <summary>
        /// Indexes a message.
        /// </summary>
        /// <param name="page">The page the message belongs to.</param>
        /// <param name="id">The message ID.</param>
        /// <param name="subject">The subject; a leading "RE:" (any casing) is stripped before indexing.</param>
        /// <param name="dateTime">The date/time of the message.</param>
        /// <param name="body">The body.</param>
        /// <returns>The number of indexed words, including duplicates, or <c>0</c> if indexing failed
        /// (the failure is logged and the message is skipped).</returns>
        private int IndexMessage(PageInfo page, int id, string subject, DateTime dateTime, string body)
        {
            lock(this) {
                try {
                    // Trim "RE:" to avoid polluting the search engine index.
                    // The prefix is matched ordinally and case-insensitively so that exactly the three
                    // characters removed by Substring(3) are the ones that were tested, independently of
                    // the current culture (and without allocating a lower-cased copy of the subject).
                    if(subject.Length > 3 && subject.StartsWith("re:", StringComparison.OrdinalIgnoreCase)) {
                        subject = subject.Substring(3).Trim();
                    }

                    string documentName = MessageDocument.GetDocumentName(page, id);

                    DumpedDocument ddoc = new DumpedDocument(0, documentName, host.PrepareTitleForIndexing(null, subject),
                        MessageDocument.StandardTypeTag, dateTime);

                    // Store the document
                    // The content should always be prepared using IHost.PrepareForSearchEngineIndexing()
                    int count = index.StoreDocument(new MessageDocument(page, id, ddoc, TokenizeContent), null,
                        host.PrepareContentForIndexing(null, body), null);

                    // A non-empty body that yields no words is suspicious: report it
                    if(count == 0 && body.Length > 0) {
                        host.LogEntry("Indexed 0 words for message " + page.FullName + ":" + id.ToString() + ": possible index corruption. Please report this error to the developers",
                            LogEntryType.Warning, null, this);
                    }

                    return count;
                }
                catch(Exception ex) {
                    // Best effort: a message that cannot be indexed is logged and skipped
                    host.LogEntry("Message indexing error for " + page.FullName + ":" + id.ToString() + " (skipping message): " + ex.ToString(), LogEntryType.Error, null, this);
                    return 0;
                }
            }
        }
示例#7
0
 /// <summary>
 /// Serializes the fields of a <see cref="DumpedDocument" /> through a <see cref="BinaryWriter" />.
 /// </summary>
 /// <param name="writer">The <see cref="BinaryWriter" /> that receives the data.</param>
 /// <param name="document">The <see cref="DumpedDocument" /> to write.</param>
 /// <remarks>Fields are written in this order: ID, Name, Title, TypeTag, then the date/time
 /// as the value returned by <c>ToBinary()</c>.</remarks>
 private static void WriteDumpedDocument(BinaryWriter writer, DumpedDocument document)
 {
     // Identity
     writer.Write(document.ID);
     writer.Write(document.Name);

     // Descriptive data
     writer.Write(document.Title);
     writer.Write(document.TypeTag);

     // The timestamp is stored in its binary representation
     long timestamp = document.DateTime.ToBinary();
     writer.Write(timestamp);
 }
示例#8
0
 /// <summary>
 /// Compares two <see cref="DumpedDocument" />s by identity.
 /// </summary>
 /// <param name="d1">The first document.</param>
 /// <param name="d2">The second document.</param>
 /// <returns><c>true</c> if ID, Name and TypeTag of the two documents match, <c>false</c> otherwise.</returns>
 /// <remarks>Title and DateTime are deliberately left out of the comparison.</remarks>
 private static bool EqualDumpedDocument(DumpedDocument d1, DumpedDocument d2)
 {
     // Only ID, Name and TypeTag take part in the comparison, checked in that order
     if(d1.ID != d2.ID) return false;
     if(d1.Name != d2.Name) return false;
     return d1.TypeTag == d2.TypeTag;
 }
示例#9
0
        /// <summary>
        /// Reads the whole index (documents, words and word mappings) from the three data files.
        /// </summary>
        /// <param name="documents">Receives the dumped documents read from the documents file.</param>
        /// <param name="words">Receives the dumped words read from the words file.</param>
        /// <param name="mappings">Receives the dumped word mappings read from the mappings file.</param>
        /// <remarks>While loading, the first free document ID and the first free word ID are set to one more
        /// than the highest ID found (that is, 1 when the corresponding file holds no entries).
        /// Each file is opened read-only with <see cref="FileShare.None" />.</remarks>
        protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings)
        {
            // Documents, tracking the highest ID in use
            uint highestDocumentId = 0;
            using(FileStream stream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int total = ReadCount(stream);
                BinaryReader reader = new BinaryReader(stream, Encoding.UTF8);
                documents = new DumpedDocument[total];
                for(int position = 0; position < total; position++) {
                    DumpedDocument current = ReadDumpedDocument(reader);
                    documents[position] = current;
                    if(current.ID > highestDocumentId) highestDocumentId = current.ID;
                }
                firstFreeDocumentId = highestDocumentId + 1;
            }

            // Words, tracking the highest ID in use
            uint highestWordId = 0;
            using(FileStream stream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int total = ReadCount(stream);
                BinaryReader reader = new BinaryReader(stream, Encoding.UTF8);
                words = new DumpedWord[total];
                for(int position = 0; position < total; position++) {
                    DumpedWord current = ReadDumpedWord(reader);
                    words[position] = current;
                    if(current.ID > highestWordId) highestWordId = current.ID;
                }
                firstFreeWordId = highestWordId + 1;
            }

            // Mappings (no ID bookkeeping needed)
            using(FileStream stream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int total = ReadCount(stream);
                BinaryReader reader = new BinaryReader(stream, Encoding.UTF8);
                mappings = new DumpedWordMapping[total];
                for(int position = 0; position < total; position++) {
                    mappings[position] = ReadDumpedWordMapping(reader);
                }
            }
        }
示例#10
0
        /// <summary>
        /// Removes a page from the search engine index.
        /// </summary>
        /// <param name="content">The content of the page to remove; its page info, title and
        /// last-modified date/time are used to describe the document being removed.</param>
        private void UnindexPage(PageContent content)
        {
            lock(this) {
                string documentName = PageDocument.GetDocumentName(content.PageInfo);
                string preparedTitle = host.PrepareTitleForIndexing(content.PageInfo, content.Title);

                // Describe the document exactly as it is described when it gets indexed
                DumpedDocument descriptor = new DumpedDocument(0, documentName, preparedTitle,
                    PageDocument.StandardTypeTag, content.LastModified);
                PageDocument pageDocument = new PageDocument(content.PageInfo, descriptor, TokenizeContent);

                index.RemoveDocument(pageDocument, null);
            }
        }
示例#11
0
        /// <summary>
        /// Removes a message from the search engine index.
        /// </summary>
        /// <param name="page">The page the message belongs to.</param>
        /// <param name="id">The message ID.</param>
        /// <param name="subject">The subject; a leading "RE:" (any casing) is stripped before use.</param>
        /// <param name="dateTime">The date/time of the message. This value is not read by the method:
        /// the document passed to the index is stamped with the current time instead.</param>
        /// <param name="body">The body. This value is not read by the method.</param>
        private void UnindexMessage(PageInfo page, int id, string subject, DateTime dateTime, string body)
        {
            lock(this) {
                // Trim "RE:" to avoid polluting the search engine index.
                // The prefix is matched ordinally and case-insensitively so that exactly the three
                // characters removed by Substring(3) are the ones that were tested, independently of
                // the current culture (and without allocating a lower-cased copy of the subject).
                if(subject.Length > 3 && subject.StartsWith("re:", StringComparison.OrdinalIgnoreCase)) {
                    subject = subject.Substring(3).Trim();
                }

                string documentName = MessageDocument.GetDocumentName(page, id);

                DumpedDocument ddoc = new DumpedDocument(0, documentName, host.PrepareTitleForIndexing(null, subject),
                    MessageDocument.StandardTypeTag, DateTime.Now);
                index.RemoveDocument(new MessageDocument(page, id, ddoc, TokenizeContent), null);
            }
        }
示例#12
0
        /// <summary>
        /// Adds a page to the search engine index.
        /// </summary>
        /// <param name="content">The content of the page to index.</param>
        /// <returns>The number of indexed words, including duplicates, or <c>0</c> if indexing failed
        /// (the failure is logged and the page is skipped).</returns>
        private int IndexPage(PageContent content)
        {
            lock(this) {
                try {
                    string documentName = PageDocument.GetDocumentName(content.PageInfo);
                    string preparedTitle = host.PrepareTitleForIndexing(content.PageInfo, content.Title);

                    DumpedDocument descriptor = new DumpedDocument(0, documentName, preparedTitle,
                        PageDocument.StandardTypeTag, content.LastModified);
                    PageDocument pageDocument = new PageDocument(content.PageInfo, descriptor, TokenizeContent);

                    // Store the document; the content is always run through the host's
                    // preparation routine before it reaches the index
                    int indexedWords = index.StoreDocument(pageDocument,
                        content.Keywords, host.PrepareContentForIndexing(content.PageInfo, content.Content), null);

                    // Non-empty content that yields no words is suspicious: report it
                    bool nothingIndexed = indexedWords == 0 && content.Content.Length > 0;
                    if(nothingIndexed) {
                        host.LogEntry("Indexed 0 words for page " + content.PageInfo.FullName + ": possible index corruption. Please report this error to the developers",
                            LogEntryType.Warning, null, this);
                    }

                    return indexedWords;
                }
                catch(Exception ex) {
                    // Best effort: a page that cannot be indexed is logged and skipped
                    host.LogEntry("Page indexing error for " + content.PageInfo.FullName + " (skipping page): " + ex.ToString(), LogEntryType.Error, null, this);
                    return 0;
                }
            }
        }
示例#13
0
 /// <summary>
 /// Loads the index from the data store the first time.
 /// Derived classes implement this method and hand back the loaded data through the three output parameters.
 /// </summary>
 /// <param name="documents">When the method returns, the dumped documents.</param>
 /// <param name="words">When the method returns, the dumped words.</param>
 /// <param name="mappings">When the method returns, the dumped word mappings.</param>
 protected abstract void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings);
示例#14
0
        /// <summary>
        /// Builds the <see cref="T:IDocument" /> that corresponds to a dumped document, for use by the search engine.
        /// </summary>
        /// <param name="dumpedDocument">The input dumped document.</param>
        /// <returns>A page document or a message document, depending on the type tag of the dumped document;
        /// <c>null</c> if the type tag is neither of the two or if the related page cannot be found.</returns>
        private IDocument BuildDocumentHandler(DumpedDocument dumpedDocument)
        {
            string typeTag = dumpedDocument.TypeTag;

            // Page documents
            if(typeTag == PageDocument.StandardTypeTag) {
                string pageName = PageDocument.GetPageName(dumpedDocument.Name);
                string nspace = NameTools.GetNamespace(pageName);
                string localName = NameTools.GetLocalName(pageName);

                PageInfo page = FindPage(nspace, localName, GetAllPages());
                if(page == null) return null;

                return new PageDocument(page, dumpedDocument, TokenizeContent);
            }

            // Message documents
            if(typeTag == MessageDocument.StandardTypeTag) {
                string pageFullName;
                int id;
                MessageDocument.GetMessageDetails(dumpedDocument.Name, out pageFullName, out id);

                string nspace = NameTools.GetNamespace(pageFullName);
                string localName = NameTools.GetLocalName(pageFullName);

                PageInfo page = FindPage(nspace, localName, GetAllPages());
                if(page == null) return null;

                return new MessageDocument(page, id, dumpedDocument, TokenizeContent);
            }

            // Any other type tag is not handled here
            return null;
        }
示例#15
0
 /// <summary>
 /// Turns a dumped document into the matching file or page attachment document.
 /// </summary>
 /// <param name="doc">The dumped document instance.</param>
 /// <returns>A <see cref="T:FileDocument" /> or a <see cref="T:PageAttachmentDocument" />, selected by the type tag of <paramref name="doc"/>.</returns>
 /// <exception cref="NotSupportedException">If the type tag matches neither files nor page attachments.</exception>
 private static IDocument DetectFileOrAttachment(DumpedDocument doc)
 {
     string tag = doc.TypeTag;

     if(tag == FileDocument.StandardTypeTag) return new FileDocument(doc);
     if(tag == PageAttachmentDocument.StandardTypeTag) return new PageAttachmentDocument(doc);

     // Unknown kind of document
     throw new NotSupportedException();
 }
        /// <summary>
        /// Tries to load all data related to a word from the database.
        /// </summary>
        /// <param name="text">The word text.</param>
        /// <param name="word">The returned word, or <c>null</c> when the method returns <c>false</c>.</param>
        /// <param name="connection">An open database connection.</param>
        /// <returns><c>true</c> if the word is found and has at least one mapping, <c>false</c> otherwise.</returns>
        /// <remarks>Mappings that point to a document whose row is missing, or for which no
        /// <see cref="T:IDocument" /> can be built, are skipped: the returned word only carries
        /// occurrences for documents that could be resolved.</remarks>
        private bool TryFindWord(string text, out Word word, DbConnection connection)
        {
            // 1. Find word - if not found, return
            // 2. Read all raw word mappings
            // 3. Read all documents (unique)
            // 4. Build result data structure

            ICommandBuilder builder = GetCommandBuilder();
            QueryBuilder queryBuilder = new QueryBuilder(builder);

            // 1. Find the word ID by text
            string query = queryBuilder.SelectFrom("IndexWord", new string[] { "Id" });
            query = queryBuilder.Where(query, "Text", WhereOperator.Equals, "Text");

            List<Parameter> parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.String, "Text", text));

            DbCommand command = builder.GetCommand(connection, query, parameters);

            int wordId = ExecuteScalar<int>(command, -1, false);

            if(wordId == -1) {
                word = null;
                return false;
            }

            // 2. Read all raw mappings
            query = queryBuilder.SelectFrom("IndexWordMapping");
            query = queryBuilder.Where(query, "Word", WhereOperator.Equals, "WordId");

            parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.Int32, "WordId", wordId));

            command = builder.GetCommand(connection, query, parameters);

            DbDataReader reader = ExecuteReader(command, false);

            List<DumpedWordMapping> mappings = new List<DumpedWordMapping>(2048);
            try {
                while(reader != null && reader.Read()) {
                    mappings.Add(new DumpedWordMapping((uint)wordId,
                        (uint)(int)reader["Document"],
                        (ushort)(short)reader["FirstCharIndex"], (ushort)(short)reader["WordIndex"],
                        (byte)reader["Location"]));
                }
            }
            finally {
                // Release the reader even if a row cannot be read or converted
                CloseReader(reader);
            }

            if(mappings.Count == 0) {
                word = null;
                return false;
            }

            // 3. Find all documents, querying each distinct document ID once
            query = queryBuilder.SelectFrom("IndexDocument");
            query = queryBuilder.Where(query, "Id", WhereOperator.Equals, "DocId");

            parameters = new List<Parameter>(1);
            parameters.Add(new Parameter(ParameterType.Int32, "DocId", 0));

            Dictionary<uint, IDocument> documents = new Dictionary<uint, IDocument>(64);
            foreach(DumpedWordMapping map in mappings) {
                uint docId = map.DocumentID;
                if(documents.ContainsKey(docId)) continue;

                parameters[0].Value = (int)docId;
                command = builder.GetCommand(connection, query, parameters);

                reader = ExecuteReader(command, false);

                try {
                    if(reader != null && reader.Read()) {
                        DumpedDocument dumpedDoc = new DumpedDocument(docId,
                            reader["Name"] as string, reader["Title"] as string,
                            reader["TypeTag"] as string,
                            (DateTime)reader["DateTime"]);

                        IDocument document = BuildDocument(dumpedDoc);

                        // A null document is simply not registered; its mappings are skipped below
                        if(document != null) documents.Add(docId, document);
                    }
                }
                finally {
                    CloseReader(reader);
                }
            }

            // 4. Build the occurrences, grouped by document
            OccurrenceDictionary occurrences = new OccurrenceDictionary(mappings.Count);
            foreach(DumpedWordMapping map in mappings) {
                IDocument document;

                // The document may be absent (row not found or document not buildable):
                // indexing the dictionary directly would throw KeyNotFoundException
                if(!documents.TryGetValue(map.DocumentID, out document)) continue;

                if(!occurrences.ContainsKey(document)) {
                    occurrences.Add(document, new SortedBasicWordInfoSet(2));
                }

                occurrences[document].Add(new BasicWordInfo(
                    map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location)));
            }

            word = new Word((uint)wordId, text, occurrences);
            return true;
        }
        /// <summary>
        /// Builds the <see cref="T:IDocument" /> that corresponds to a dumped document, for use by the search engine.
        /// </summary>
        /// <param name="dumpedDocument">The input dumped document.</param>
        /// <returns>A dummy document carrying the dumped data when <c>alwaysGenerateDocument</c> is set;
        /// otherwise a page document or a message document depending on the type tag, or <c>null</c>
        /// if the type tag is neither of the two or the related page cannot be obtained.</returns>
        private IDocument BuildDocument(DumpedDocument dumpedDocument)
        {
            // Shortcut: wrap the dumped data without resolving any page
            if(alwaysGenerateDocument) {
                DummyDocument dummy = new DummyDocument();
                dummy.ID = dumpedDocument.ID;
                dummy.Name = dumpedDocument.Name;
                dummy.Title = dumpedDocument.Title;
                dummy.TypeTag = dumpedDocument.TypeTag;
                dummy.DateTime = dumpedDocument.DateTime;
                return dummy;
            }

            // Page documents
            if(dumpedDocument.TypeTag == PageDocument.StandardTypeTag) {
                string pageName = PageDocument.GetPageName(dumpedDocument.Name);

                PageInfo page = GetPage(pageName);
                if(page == null) return null;

                return new PageDocument(page, dumpedDocument, TokenizeContent);
            }

            // Message documents
            if(dumpedDocument.TypeTag == MessageDocument.StandardTypeTag) {
                string pageFullName;
                int id;
                MessageDocument.GetMessageDetails(dumpedDocument.Name, out pageFullName, out id);

                PageInfo page = GetPage(pageFullName);
                if(page == null) return null;

                return new MessageDocument(page, id, dumpedDocument, TokenizeContent);
            }

            // Any other type tag is not handled here
            return null;
        }
示例#18
0
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the build delegate returns <c>null</c> are treated as no longer existing:
        /// they are skipped together with all the mappings that refer to them. Mappings that refer to
        /// an unknown word or to an unknown document are skipped as well.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if(documents == null) throw new ArgumentNullException("documents");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");

            if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

            lock(this) {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // IDs of the documents that are missing.
                // A set is used because membership is tested once per mapping.
                HashSet<uint> missingDocuments = new HashSet<uint>();

                // 1. Prepare a dictionary with all documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach(DumpedDocument doc in documents) {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - silently skip it
                    if(builtDoc != null) {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                    else {
                        missingDocuments.Add(doc.ID);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);

                foreach(DumpedWord w in words) {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings and documents
                foreach(DumpedWordMapping map in mappings) {
                    // Skip mappings that refer to missing documents
                    if(missingDocuments.Contains(map.DocumentID)) continue;

                    // Gracefully skip unknown words and unknown documents: the lookups are done
                    // with TryGetValue instead of catching KeyNotFoundException for every bad mapping
                    Word mappedWord;
                    IDocument mappedDocument;
                    if(tempWords.TryGetValue(map.WordID, out mappedWord) &&
                        tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) {
                        mappedWord.AddOccurrence(mappedDocument,
                            map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                    }
                }
            }
        }