Beispiel #1
0
        /// <summary>
        /// Tokenizes a string.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="location">The location of the words that are extracted.</param>
        /// <returns>The tokens, in the order they appear in <paramref name="text"/>.</returns>
        /// <remarks>Word positions are stored as <c>ushort</c> values, so only words that start
        /// before character 65500 are extracted; any text beyond that point is ignored.
        /// Words that are empty after diacritics and punctuation are removed are skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
        public static WordInfo[] Tokenize(string text, WordLocation location)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Words starting at or beyond this offset are not extracted
            const int maxWordStart = 65500;

            List<WordInfo> words = new List<WordInfo>(text.Length / 5); // Average 5 chars/word

            // Skip all leading split chars
            ushort currentWordStart = SkipSplitChars(0, text);

            // The scan position is an int on purpose: a word that begins just before the limit
            // may run past ushort.MaxValue, and a ushort counter would wrap around to zero
            int currentIndex = currentWordStart;

            while (currentIndex < text.Length && currentIndex < maxWordStart)
            {
                // Advance to the end of the current word
                while (currentIndex < text.Length && !Tools.IsSplitChar(text[currentIndex]))
                {
                    currentIndex++;
                }

                string w = text.Substring(currentWordStart, currentIndex - currentWordStart);
                w = Tools.RemoveDiacriticsAndPunctuation(w, true);
                if (!string.IsNullOrEmpty(w))
                {
                    words.Add(new WordInfo(w, currentWordStart, (ushort)words.Count, location));
                }

                // The next word cannot start at a position a ushort can represent: stop here
                // instead of letting the cast below wrap and restart from the beginning
                int next = currentIndex + 1;
                if (next > ushort.MaxValue)
                {
                    break;
                }

                currentWordStart = SkipSplitChars((ushort)next, text);
                currentIndex     = currentWordStart;
            }

            return words.ToArray();
        }
        /// <summary>
        /// Stores a word in the catalog.
        /// </summary>
        /// <param name="wordText">The word to store (it is lower-cased with the invariant culture before lookup).</param>
        /// <param name="document">The document the word occurs in.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document the word occurs at.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="newWord">The newly-created word if <paramref name="wordText"/> was not in the catalog yet, or <c>null</c>.</param>
        /// <param name="dumpedWord">The dumped data of the newly-created word, or <c>null</c> if the word already existed.</param>
        /// <returns>The dumped word mapping data.</returns>
        /// <remarks>The catalog lookup and update run inside <c>lock (this)</c>.
        /// The original documentation states that storing a word is <b>O(n log n)</b>,
        /// where <b>n</b> is the number of words already in the index; the actual cost depends on the
        /// catalog implementation, which is not visible from this method.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="wordText"/> or <paramref name="document"/> are <c>null</c>.</exception>
        protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
                                              WordLocation location, out Word newWord, out DumpedWord dumpedWord)
        {
            // Validate before touching the catalog: otherwise a failure on a null document
            // would surface only after a brand-new word has already been added to the catalog
            if (wordText == null)
            {
                throw new ArgumentNullException("wordText");
            }
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            wordText = wordText.ToLower(CultureInfo.InvariantCulture);

            lock (this)
            {
                Word word;

                if (catalog.TryGetValue(wordText, out word))
                {
                    // Known word: nothing new to report to the caller
                    newWord    = null;
                    dumpedWord = null;
                }
                else
                {
                    // Use ZERO as initial ID, update when IndexStorer has stored the word
                    // A reference to this newly-created word must be passed outside this method
                    word = new Word(0, wordText);
                    catalog.Add(wordText, word);
                    newWord    = word;
                    dumpedWord = new DumpedWord(word);
                }

                word.AddOccurrence(document, firstCharIndex, wordIndex, location);
                return new DumpedWordMapping(word.ID, document.ID, firstCharIndex, wordIndex, location.Location);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates a <see cref="WordInfo" /> for a word found in a document.
        /// </summary>
        /// <param name="text">The word text; it is stored after diacritics and punctuation are removed.</param>
        /// <param name="firstCharIndex">Position of the word's first character in the document.</param>
        /// <param name="wordIndex">Ordinal position of the word in the document.</param>
        /// <param name="location">Where in the document the word is located.</param>
        /// <remarks>The normalized text is not re-checked for emptiness after normalization.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
        public WordInfo(string text, ushort firstCharIndex, ushort wordIndex, WordLocation location)
            : base(firstCharIndex, wordIndex, location)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }
            else if (text.Length < 1)
            {
                throw new ArgumentException("Invalid text", "text");
            }

            // Keep only the normalized form of the word
            string normalized = Tools.RemoveDiacriticsAndPunctuation(text, true);
            this.text = normalized;
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="BasicWordInfo" /> class.
        /// </summary>
        /// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word in the document.</param>
        /// <remarks>Both indexes are unsigned (<c>ushort</c>), so they can never be negative and
        /// no range validation is required.</remarks>
        public BasicWordInfo(ushort firstCharIndex, ushort wordIndex, WordLocation location)
        {
            this.firstCharIndex = firstCharIndex;
            this.wordIndex      = wordIndex;
            this.location       = location;
        }
Beispiel #5
0
        /// <summary>
        ///     Stores an occurrence.
        /// </summary>
        /// <param name="document">The document the occurrence is referred to.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <remarks>
        ///     The occurrence is added to the set already stored for <paramref name="document" />, or to a
        ///     new set if the document had no occurrences yet. The update runs while holding a lock on the
        ///     occurrences collection. The original documentation states that adding is <b>O(n)</b> in the
        ///     number of occurrences already stored for the same document; that cost is determined by the
        ///     set implementation, which is not visible from this method.
        ///     Both indexes are unsigned (<c>ushort</c>), so they can never be negative.
        /// </remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="document" /> is <c>null</c>.</exception>
        public void AddOccurrence(IDocument document, ushort firstCharIndex, ushort wordIndex, WordLocation location)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            BasicWordInfo occurrence = new BasicWordInfo(firstCharIndex, wordIndex, location);

            lock (occurrences)
            {
                // NOTE(review): ContainsKey + indexer is kept because the declared type of
                // 'occurrences' is not visible here; switch to TryGetValue if it offers one
                if (occurrences.ContainsKey(document))
                {
                    // Existing document
                    occurrences[document].Add(occurrence);
                }
                else
                {
                    // New document
                    var set = new SortedBasicWordInfoSet();
                    set.Add(occurrence);
                    occurrences.Add(document, set);
                }
            }
        }
        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Documents for which the build-document delegate returns <c>null</c> are treated as no longer
        /// existing; mappings that refer to such documents, to unknown documents or to unknown words
        /// are silently skipped. The whole operation runs inside <c>lock (this)</c>.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/> or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="M:SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings)
        {
            if (documents == null)
            {
                throw new ArgumentNullException("documents");
            }
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }
            if (mappings == null)
            {
                throw new ArgumentNullException("mappings");
            }

            if (buildDocument == null)
            {
                throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");
            }

            lock (this)
            {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // 1. Prepare a dictionary with all documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach (DumpedDocument doc in documents)
                {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - silently skip it.
                    // Its mappings are skipped in step 3 because the ID is absent from tempDocuments
                    if (builtDoc != null)
                    {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<ulong, Word> tempWords = new Dictionary<ulong, Word>(words.Length);
                foreach (DumpedWord w in words)
                {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings and documents
                foreach (DumpedWordMapping map in mappings)
                {
                    Word mappedWord;
                    IDocument mappedDocument;

                    // Skip mappings that refer to unknown words or to missing/unknown documents.
                    // Hash lookups replace the former linear list scan and exception-driven skipping
                    if (tempWords.TryGetValue(map.WordID, out mappedWord) &&
                        tempDocuments.TryGetValue(map.DocumentID, out mappedDocument))
                    {
                        mappedWord.AddOccurrence(mappedDocument,
                                                 map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
                    }
                }
            }
        }