Exemple #1
0
        /// <summary>
        /// Parses one document: splits <paramref name="content"/> into tokens, runs every token
        /// through an ordered chain of parsing rules and returns the resulting term dictionary.
        /// </summary>
        /// <remarks>
        /// Resets the per-document state (<c>parserList</c>, <c>termsDic</c>, <c>maxFrecInt</c>,
        /// <c>splited</c>) and registers <paramref name="docData"/> in <c>docInfo</c> under its name.
        /// The rules are tried in the order written; the first one that matches consumes the token.
        /// Helpers that receive <c>i</c> by <c>ref</c> are expected to advance it themselves
        /// (their bodies are not visible here - verify when changing the loop).
        /// </remarks>
        /// <param name="docData">Document descriptor; its <c>Name</c> keys <c>docInfo</c> and its <c>max_tf</c> is updated.</param>
        /// <param name="content">Raw text of the document.</param>
        /// <returns>
        /// <c>termsDic</c> (presumably filled by <c>addToDic</c> and the rule handlers), or, when
        /// <c>stemBool</c> is set, a new dictionary keyed by stem in which the values of all terms
        /// sharing a stem are summed.
        /// </returns>
        public Dictionary <string, int> parseFile(DocumentData docData, string content)
        {
            parserList = new List <string>();
            //add the data from the readFile instance
            docInfo[docData.Name] = docData;
            termsDic   = new Dictionary <string, int>();
            maxFrecInt = 0;
            string[] delimiters = { " ", "\r\n", "\n", "--" };
            string   word;
            int      i = 0;

            //split the document's content into a string array
            splited = content.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            string termStr = "";

            //indicates whether trailing characters were chopped off the current word
            bool chopedLast = false;

            //find the matching rule for each token
            while (i < splited.Length)
            {
                chopedLast = false;

                //chop the start of the word if it contains unwanted characters
                word = chopStart(splited[i]);

                //chop unwanted trailing characters one at a time
                while (word != "" && toRemoveLast(word))
                {
                    chopedLast = true;
                    word       = word.Substring(0, word.Length - 1);
                }

                //delete unwanted characters anywhere in the word
                word = toRemoveAny(word);

                //a word without any letter or digit is skipped
                if (word == "" || !(word.Any(char.IsLetterOrDigit)))
                {
                    i++;
                }

                //the word "Language" is deliberately not parsed
                else if (word == "Language")
                {
                    i++;
                }

                //skip everything from an opening "<F" token up to and including the closing "</F>"
                else if (splited[i] == "<F")
                {
                    while (i < splited.Length - 1 && splited[i] != "</F>")
                    {
                        i++;
                    }
                    i++;
                }

                //skip the article-type header; the bounds check keeps a trailing "Article"
                //token from reading past the end of the array
                else if (word == "Article" && i + 1 < splited.Length && splited[i + 1].StartsWith("Type", StringComparison.Ordinal))
                {
                    i = i + 3;
                }

                else if (splited[i] == "[Text]")
                {
                    i++;
                }

                //skip tokens that open or close a markup tag (tokens are never empty here
                //because the split removes empty entries)
                else if (splited[i][0] == '<' || splited[i][splited[i].Length - 1] == '>')
                {
                    i++;
                }

                //check if the word starts a date; the helper gets i by ref and handles the term itself
                else if (checkAndParseDate(ref i, docData.Name) == true)
                {
                }

                //"between <number> and <number>" rule
                else if (!chopedLast && i < splited.Length - 3 && (word == "Between" || word == "between") && (splited[i + 2] == "And" || splited[i + 2] == "and") && (isNumber(splited[i + 1]) || isAfrac(splited[i + 1])) && (isNumberWithAfter(splited[i + 3]) || isAfracWithAfter(splited[i + 3])))
                {
                    string tmpS = splited[i + 3];

                    //chop the unnecessary characters from the end of the second number;
                    //guarded against the empty string like the main chopping loop above
                    while (tmpS != "" && toRemoveLast(tmpS))
                    {
                        tmpS = tmpS.Substring(0, tmpS.Length - 1);
                    }
                    termStr = "between " + parseNumber(splited[i + 1]) + " and " + parseNumber(tmpS);
                    i       = i + 4;
                    addToDic(termStr, docData.Name);
                }

                //a word of two or more characters, all upper case, goes to the capital-letters handler
                else if (!chopedLast && i < splited.Length && word.Length > 1 && (word.All(char.IsUpper)))
                {
                    handleCapitalLetters(docData, ref word, ref i);
                }

                //check if the word starts with $
                else if (word[0] == '$')
                {
                    handleDollarAtStart(docData, ref word, ref i, ref termStr);
                }

                //check if there is a single inner hyphen (not leading, trailing or doubled)
                else if (word.Contains('-') && !word.Contains("--") && word[0] != '-' && word[word.Length - 1] != '-')
                {
                    termStr = handleHyphen(word, ref i, ref chopedLast);

                    //store the new term in the parser's dictionary
                    addToDic(termStr.ToLower(), docData.Name);
                }

                //check if the word is a fraction
                else if (isAfrac(word))
                {
                    termStr = word;
                    i++;
                    addToDic(termStr, docData.Name);
                }

                //check if the word is a number
                else if (isNumber(word))
                {
                    termStr = handleNumber(docData, word, ref i, ref chopedLast);
                }

                //check if the word contains only letters
                else if (word.All(char.IsLetter))
                {
                    word = word.ToLower();
                    if (!stopWords.Contains(word))
                    {
                        termStr = word;
                        addToDic(termStr.ToLower(), docData.Name);
                    }
                    i++;
                }

                //check if the word is a stop word
                else if (stopWords.Contains(word.ToLower()))
                {
                    i++;
                }

                //the word did not match any rule: add it to the dictionary as is (lower-cased)
                else
                {
                    termStr = word;
                    addToDic(termStr.ToLower(), docData.Name);
                    i++;
                }
            }

            //check if there is a need to stem
            if (stemBool == true)
            {
                Dictionary <string, int> termsDicStemer = new Dictionary <string, int>();

                //largest merged count seen so far; starts from the un-stemmed maximum so the
                //stored value is never lower than what the non-stemming path would record
                var stemmedMax = maxFrecInt;
                foreach (KeyValuePair <string, int> entry in termsDic)
                {
                    string stem = stemmer.stemTerm(entry.Key);

                    //TryGetValue leaves 0 in merged when the stem is new
                    int merged;
                    termsDicStemer.TryGetValue(stem, out merged);
                    merged += entry.Value;
                    termsDicStemer[stem] = merged;

                    if (merged > stemmedMax)
                    {
                        stemmedMax = merged;
                    }
                }

                //record max_tf for stemmed documents too (previously skipped by the early return)
                docInfo[docData.Name].max_tf = stemmedMax;

                //return the dictionary after stemming
                return(termsDicStemer);
            }
            docInfo[docData.Name].max_tf = maxFrecInt;

            //return the terms' dictionary
            return(termsDic);
        }
Exemple #2
0
        /// <summary>
        /// Normalizes a raw token and records one occurrence of the resulting term in
        /// <c>termPerDoc</c>, keeping <c>maxtf</c>/<c>maxterm</c> up to date.
        /// </summary>
        /// <remarks>
        /// Processing order: normalize (lower-case, trim '-' and spaces, drop '/'), drop stop words,
        /// then: a token containing '.' that parses as a number is stored rounded to two decimals;
        /// a token still containing '-' is delegated to <c>splitslash</c>; anything else is mapped
        /// through <c>prefix</c>, optionally stemmed (with the <c>stemmers</c> cache), and counted.
        /// Null, empty or whitespace-only results are ignored.
        /// </remarks>
        /// <param name="s">Raw token text.</param>
        /// <param name="fa">
        /// Value forwarded unchanged to the <c>termInfo</c> constructor and to <c>splitslash</c>;
        /// its meaning is not visible in this block.
        /// </param>
        public void strToLowerCase(string s, int fa)
        {
            if (string.IsNullOrEmpty(s))
            {
                return;
            }

            //removing '/' can expose new outer spaces, so trim once more BEFORE the emptiness check;
            //otherwise input such as "/ /" would be recorded as an empty term
            string str = s.ToLowerInvariant().Trim('-').Trim(' ').Replace("/", "").Trim(' ');
            if (str.Length == 0)
            {
                return;
            }

            if (stop_words.Contains(str))
            {
                return;
            }

            //numeric tokens: parse the raw text with the invariant culture so '.' is always the
            //decimal separator regardless of the machine locale (same number styles as the
            //default Double.TryParse overload)
            double number;
            if (s.Contains(".") && Double.TryParse(s, System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands, System.Globalization.CultureInfo.InvariantCulture, out number))
            {
                string numericTerm = System.Math.Round(number, 2).ToString(System.Globalization.CultureInfo.InvariantCulture);
                addTermOccurrence(numericTerm, fa);
                return;
            }

            //tokens with an inner hyphen are handled by the dedicated splitter
            if (str.Contains("-"))
            {
                splitslash(str, fa);
                return;
            }

            //replace the token by its mapped form when one is registered in prefix
            string mapped;
            if (prefix.TryGetValue(str, out mapped))
            {
                str = mapped;
            }

            if (toStem)
            {
                //stemmers caches the stem of every term seen so far
                string stemmed;
                if (!stemmers.TryGetValue(str, out stemmed))
                {
                    stemmed = stem.stemTerm(str);
                    stemmers.Add(str, stemmed);
                }
                str = stemmed;
            }

            addTermOccurrence(str, fa);
        }

        /// <summary>
        /// Counts one occurrence of <paramref name="term"/> in <c>termPerDoc</c> and updates
        /// <c>maxtf</c>/<c>maxterm</c> when this term's frequency exceeds the current maximum.
        /// </summary>
        /// <param name="term">Normalized term to count.</param>
        /// <param name="fa">Value passed to the <c>termInfo</c> constructor for a new term.</param>
        private void addTermOccurrence(string term, int fa)
        {
            termInfo info;
            if (termPerDoc.TryGetValue(term, out info))
            {
                info.tf++;
            }
            else
            {
                info = new termInfo(1, fa);
                termPerDoc.Add(term, info);
            }

            //checked for new terms as well, so a document in which every term occurs once
            //still ends up with a non-stale maxtf/maxterm
            if (info.tf > maxtf)
            {
                maxtf   = info.tf;
                maxterm = term;
            }
        }