Пример #1
0
        // Stems every word of the passed document text, gives unseen stems a new word ID
        // and counts how often each (document, word) pair occurs.
        private void IndexDocumentText(string sDocText, int nDocID)
        {
            string[] docWords = LSICommon.Instance.GetWords(sDocText);
            int      position = 0;

            while (position < docWords.Length)
            {
                // The stem replaces the raw word inside the array returned by GetWords.
                string stem = goStemmmer.stemTerm(docWords[position]);
                docWords[position] = stem;

                // A stem seen for the first time receives the current size of the word list as its ID.
                if (!goWordList.ContainsKey(stem))
                {
                    goWordList.Add(stem, goWordList.Count);
                }

                DocWordRelation relation;
                relation.DocID  = nDocID;
                relation.WordID = (int)goWordList[stem];

                // Bump the occurrence counter of this (document, word) pair, starting at 1.
                if (goDocWord.ContainsKey(relation))
                {
                    goDocWord[relation] = (int)goDocWord[relation] + 1;
                }
                else
                {
                    goDocWord.Add(relation, 1);
                }

                position++;
            }
        }
Пример #2
0
 /// <summary>
 /// Produces the stem of a single word by delegating to the Word Stemmer interface.
 /// This uses the Porter word stemmer algorithm.
 /// </summary>
 /// <param name="Word">The word to stem</param>
 /// <returns>The stem of the word</returns>
 public string StemWord(string Word)
 {
     string stem = Stemmer.stemTerm(Word);
     return stem;
 }
Пример #3
0
        /// <summary>
        /// Parses the given text according to the parsing rules (dates, expressions with numbers,
        /// pairs of capitalized words, optional stemming) and counts how often each term occurs.
        /// </summary>
        /// <param name="text">the text to parse</param>
        /// <param name="maxTerm">receives the most frequent term and its frequency; an empty string and 0 when no term was counted</param>
        /// <returns>the dictionary of the terms and their frequency in the text</returns>
        public Dictionary <string, int> ParseText(string text, out KeyValuePair <string, int> maxTerm)
        {
            int    maxTermFreq   = 0;
            string maxTermString = "";
            Dictionary <string, int> parsingDictionary = new Dictionary <string, int>();

            text = RemoveSymbolsFromDocument(text);
            string[] terms = text.Split(delimetersToSplitWords, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < terms.Length; i++)
            {
                string new_word = RemoveSpecialCahracter(terms[i]);
                if (IsStopWord(new_word))
                {
                    continue;
                }

                // A date needs an all-digit token and a month name among this token and the next one.
                // Month names are lower-cased with the invariant culture so the enum lookup does not
                // depend on the machine's culture settings.
                // NOTE(review): the bound excludes the last three tokens although only terms[i + 2] is read — confirm this is intended.
                bool looksLikeDate = i < terms.Length - 3
                                     && (new_word.All(char.IsDigit) || terms[i + 1].All(char.IsDigit))
                                     && (Enum.IsDefined(typeof(Months), new_word.ToLowerInvariant())
                                         || Enum.IsDefined(typeof(Months), terms[i + 1].ToLowerInvariant()));

                if (looksLikeDate)
                {
                    // A numeric third token is treated as part of the date; otherwise only two tokens are consumed.
                    int retNum1;
                    if (Int32.TryParse(terms[i + 2], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out retNum1))
                    {
                        new_word = ParseDate(new_word + ' ' + terms[i + 1] + ' ' + terms[i + 2]);
                        i       += 2;
                    }
                    else
                    {
                        new_word = ParseDate(new_word + ' ' + terms[i + 1]);
                        i++;
                    }
                }
                // parse expression with numbers
                else if (new_word.Any(char.IsDigit))
                {
                    string next_word = i < terms.Length - 1 ? terms[i + 1] : "";
                    int    increaseI;
                    new_word = DealWithNumber(terms[i], next_word, out increaseI);
                    if (increaseI == -1)
                    {
                        // DealWithNumber signals with -1 that nothing should be recorded for this token.
                        continue;
                    }
                    i += increaseI;
                }
                else
                {
                    new_word = RemoveSpecialCahracterAfterDealingWithNumbers(new_word);
                    if (IsStopWord(new_word) || string.Equals(new_word, "may", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // check for capital letters
                    if (new_word.Any(char.IsUpper))
                    {
                        new_word = new_word.ToLowerInvariant();

                        // NOTE(review): the bound skips the pair formed by the last two tokens — confirm this is intended.
                        if (i < terms.Length - 2)
                        {
                            string next_word = RemoveSpecialCahracter(terms[i + 1]).ToLowerInvariant();
                            next_word = RemoveSpecialCahracterAfterDealingWithNumbers(next_word);

                            if (terms[i + 1].Any(char.IsUpper) && !IsStopWord(next_word))
                            {
                                // Two consecutive capitalized words: record the second word and the
                                // two-word expression here; the first word is recorded below.
                                RecordTermOccurrence(parsingDictionary, next_word, ref maxTermString, ref maxTermFreq);
                                RecordTermOccurrence(parsingDictionary, new_word + " " + next_word, ref maxTermString, ref maxTermFreq);
                                i++;
                            }
                        }
                    }
                }

                if (ToStem)
                {
                    new_word = stemmer.stemTerm(new_word);
                }

                RecordTermOccurrence(parsingDictionary, new_word, ref maxTermString, ref maxTermFreq);
            }

            // The maximum is only tracked when a term is seen for the second time or later, so when
            // every term occurred exactly once the first dictionary entry is reported.
            if (maxTermFreq == 0 && parsingDictionary.Count > 0)
            {
                KeyValuePair <string, int> firstEntry = parsingDictionary.First();
                maxTermString = firstEntry.Key;
                maxTermFreq   = firstEntry.Value;
            }
            maxTerm = new KeyValuePair <string, int>(maxTermString, maxTermFreq);
            return(parsingDictionary);
        }

        /// <summary>
        /// Adds one occurrence of a term to the frequency dictionary and updates the running maximum
        /// when a term that was already present overtakes it.
        /// </summary>
        /// <param name="termCounts">the dictionary of the terms and their frequency</param>
        /// <param name="term">the term that occurred</param>
        /// <param name="maxTermString">the most frequent term found so far</param>
        /// <param name="maxTermFreq">the frequency of the most frequent term found so far</param>
        private static void RecordTermOccurrence(Dictionary <string, int> termCounts, string term, ref string maxTermString, ref int maxTermFreq)
        {
            int count;
            if (!termCounts.TryGetValue(term, out count))
            {
                // First occurrence: the running maximum is deliberately left untouched.
                termCounts[term] = 1;
                return;
            }

            count++;
            termCounts[term] = count;
            if (count > maxTermFreq)
            {
                maxTermString = term;
                maxTermFreq   = count;
            }
        }
Пример #4
0
        /// <summary>
        /// Finds terms in the text with the given regex, normalizes each one according to
        /// <paramref name="type"/> and saves it through <c>addTermToDic</c>.
        /// </summary>
        /// <param name="text">text of doc; for types that do not start with W, N, Q, C or H every processed match is overwritten in place by a same-length mask</param>
        /// <param name="regex">regex to use</param>
        /// <param name="type">type of terms it finds ("Range", "Percent", "Price", "Number", "Date", "Year" or any other type, which is stored as a word term)</param>
        /// <param name="docName">the doc to search in</param>
        /// <returns>the counter that is handed by reference to <c>addTermToDic</c> for every saved term (starts at 0)</returns>
        /// <remarks>
        /// Matches that cannot be interpreted for their type (missing numbers, unknown month, empty stem, ...)
        /// are skipped instead of raising an exception.
        /// NOTE(review): number parsing and formatting use the current culture, except for prices which use en-US — confirm this is intended.
        /// </remarks>
        private static int getTerms(ref string text, Regex regex, string type, string docName)
        {
            MatchCollection terms      = regex.Matches(text);
            int             numOfTerms = 0;

            // Depends only on the type, so it is decided once for all matches.
            bool maskMatchedText = string.IsNullOrEmpty(type) || "WNQCH".IndexOf(type[0]) < 0;

            foreach (Match term in terms)
            {
                string termString = term.ToString().ToLower().Replace('\n', ' ').Trim(charsToTrim);

                // Stop words
                if (StopWords.ContainsKey(termString) || termString.Length <= 0)
                {
                    continue;
                }

                if (maskMatchedText)
                {
                    // The mask has exactly the length of the match, so the Index of the remaining
                    // matches still lines up with the modified text. A one-character match cannot
                    // carry the two surrounding blanks and is masked with '#' only.
                    string mask = term.Length >= 2
                                  ? " " + new String('#', term.Length - 2) + " "
                                  : new String('#', term.Length);
                    text = text.Substring(0, term.Index) + mask + text.Substring(term.Index + term.Length);
                }

                switch (type)
                {
                case "Range":
                {
                    MatchCollection numbers = justANumberReg.Matches(termString);
                    if (numbers.Count < 2)
                    {
                        // Not a complete range: nothing to save.
                        break;
                    }

                    int a, b;
                    int.TryParse(numbers[1].ToString(), out b);
                    int.TryParse(numbers[0].ToString(), out a);
                    if (Math.Abs(a - b) < 20)
                    {
                        // Short range: save every number from a to b.
                        for (; a <= b; a++)
                        {
                            addTermToDic(d_abNumTerms, a.ToString(), docName, term.Index, ref numOfTerms, "Number");
                        }
                    }
                    else
                    {
                        // Long range: save only the two end points.
                        addTermToDic(d_abNumTerms, a.ToString(), docName, term.Index, ref numOfTerms, "Number");
                        addTermToDic(d_abNumTerms, b.ToString(), docName, term.Index, ref numOfTerms, "Number");
                    }
                    break;
                }

                case "Percent":
                {
                    string[] percentSplit = termString.Split(' ', '%');
                    float    percent;
                    float.TryParse(percentSplit[0], out percent);
                    addTermToDic(d_abNumTerms, (percent * 0.01).ToString("P"), docName, term.Index, ref numOfTerms, "Percent");
                    break;
                }

                case "Price":
                {
                    MatchCollection number = numReg.Matches(termString);
                    if (number.Count == 0)
                    {
                        // No numeric part found: nothing to save.
                        break;
                    }

                    float price;
                    float.TryParse(number[0].ToString(), out price);
                    if (termString.Contains('m'))
                    {
                        price = price * 1000000;
                    }
                    else if (termString.Contains('n'))
                    {
                        price = price * 1000000000;
                    }
                    addTermToDic(d_abNumTerms, price.ToString("C", new CultureInfo("en-US")), docName, term.Index, ref numOfTerms, "Price");
                    break;
                }

                case "Number":
                {
                    string[] termSplit = termString.Split(' ');
                    Double   numformated;
                    Double.TryParse(termSplit[0], out numformated);

                    // The first letter of the word after the number selects the multiplier.
                    if (termSplit.Length > 1 && termSplit[1].Length > 0)
                    {
                        switch (termSplit[1][0])
                        {
                        case 'm':
                            numformated = numformated * 1000000;
                            break;

                        case 'b':
                            numformated = numformated * 1000000000;
                            break;

                        case 't':
                            numformated = numformated * 1000000000000;
                            break;

                        case 'h':
                            numformated = numformated * 100;
                            break;
                        }
                    }

                    addTermToDic(d_abNumTerms, numformated.ToString(), docName, term.Index, ref numOfTerms, "Number");
                    break;
                }

                case "Date":
                {
                    //with th
                    int thIndex = termString.IndexOf("th");
                    if (thIndex >= 0)
                    {
                        termString = termString.Remove(thIndex, 2);
                    }

                    DateTime convertedDate;
                    string   dateTerm;
                    if (DateTime.TryParse(termString, out convertedDate))
                    {
                        dateTerm = convertedDate.ToShortDateString();
                    }
                    else if (!tryFormatDateManually(termString, out dateTerm))
                    {
                        // Neither the framework parser nor the manual conversion understands the term.
                        break;
                    }

                    addTermToDic(d_abNumTerms, dateTerm, docName, term.Index, ref numOfTerms, "Date");
                    break;
                }

                case "Year":
                {
                    // DateTime only accepts years 1 through 9999.
                    int year;
                    if (!int.TryParse(termString, out year) || year < 1 || year > 9999)
                    {
                        break;
                    }

                    DateTime convertedDate = new DateTime(year, 1, 1);
                    addTermToDic(d_abNumTerms, convertedDate.ToShortDateString(), docName, term.Index, ref numOfTerms, "Date");
                    break;
                }

                default:
                    //stemmer
                    if (use_stem)
                    {
                        lock (stemmer)
                        {
                            termString = stemmer.stemTerm(termString);
                        }
                    }

                    if (string.IsNullOrEmpty(termString))
                    {
                        // Stemming left nothing to store.
                        break;
                    }

                    //insert to correct dictionary, chosen by the first character of the term
                    char firstChar = termString[0];
                    if (firstChar >= 's')
                    {
                        addTermToDic(d_szTerms, termString, docName, term.Index, ref numOfTerms, type);
                    }
                    else if (firstChar >= 'n')
                    {
                        addTermToDic(d_nrTerms, termString, docName, term.Index, ref numOfTerms, type);
                    }
                    else if (firstChar >= 'g')
                    {
                        addTermToDic(d_gmTerms, termString, docName, term.Index, ref numOfTerms, type);
                    }
                    else if (firstChar >= 'c')
                    {
                        addTermToDic(d_cfTerms, termString, docName, term.Index, ref numOfTerms, type);
                    }
                    else
                    {
                        addTermToDic(d_abNumTerms, termString, docName, term.Index, ref numOfTerms, type);
                    }
                    break;
                }
            }

            return(numOfTerms);
        }

        /// <summary>
        /// Manually converts a space separated date term ("day month [year]" or "month day|year [year]")
        /// into "dd/mm/yyyy" using the month table.
        /// </summary>
        /// <param name="termString">the date term to convert</param>
        /// <param name="formatted">the converted date, or null when the term cannot be converted</param>
        /// <returns>true when the term had at least two parts and one of the first two is a known month</returns>
        private static bool tryFormatDateManually(string termString, out string formatted)
        {
            formatted = null;

            string[] parts = termString.Split(' ');
            if (parts.Length < 2)
            {
                return(false);
            }

            string dd, mm, yyyy;
            if (months.ContainsKey(parts[1]))
            {
                // day month [year]; a missing year falls back to "2015"
                dd   = parts[0];
                mm   = months[parts[1]];
                yyyy = parts.Length == 3 ? parts[2] : "2015";
            }
            else if (months.ContainsKey(parts[0]))
            {
                mm = months[parts[0]];
                if (parts.Length == 3)
                {
                    // month day, year
                    dd   = parts[1].Trim(',');
                    yyyy = parts[2];
                }
                else if (parts[1].Length <= 2)
                {
                    // month day; the year falls back to "2015"
                    dd   = parts[1];
                    yyyy = "2015";
                }
                else
                {
                    // month year; the day falls back to "01"
                    yyyy = parts[1];
                    dd   = "01";
                }
            }
            else
            {
                return(false);
            }

            formatted = dd + "/" + mm + "/" + yyyy;
            return(true);
        }