Пример #1
0
        /// <summary>
        /// Builds the term for the numeric token at <c>splited[i]</c>, registers its
        /// lower-cased form through <c>addToDic</c> and returns the term.
        /// </summary>
        /// <param name="docData">Document being parsed; its <c>Name</c> is forwarded to <c>addToDic</c>.</param>
        /// <param name="word">The numeric token to convert.</param>
        /// <param name="i">Index into <c>splited</c>; moved past every token consumed here.</param>
        /// <param name="chopedLast">
        /// When true on entry, the tokens following the number are not inspected.
        /// May be set to true here while trimming a following hyphenated token.
        /// </param>
        /// <returns>The term text as built (not lower-cased).</returns>
        private string handleNumber(DocumentData docData, string word, ref int i, ref bool chopedLast)
        {
            bool   hasDecimalPoint = word.Contains('.');
            string result;

            // isBigAfter decides whether the number is a "big" one (per the original
            // note: larger than a million); parseBig then builds the term and the
            // index moves on by two.
            if (i < splited.Length - 1 && isBigAfter(ref i))
            {
                result = parseBig(word, ref i);
                i     += 2;
            }
            else
            {
                // A token containing '.' is kept verbatim; any other number goes through parseNumber.
                result = hasDecimalPoint ? word : parseNumber(word);
                i     += 1;
            }

            // Only look at what follows when the number itself had no trailing character stripped.
            if (!chopedLast)
            {
                if (i < splited.Length)
                {
                    string next = splited[i];

                    if (isAfracWithAfter(next))
                    {
                        // A fraction follows the number: append it and step past it.
                        result = result + parseFrac(ref i);
                        i     += 1;
                    }
                    else if (next.Contains('-'))
                    {
                        // Strip trailing characters rejected by toRemoveLast before examining the hyphenated token.
                        string candidate = next;
                        while (candidate.Length > 0 && toRemoveLast(candidate))
                        {
                            chopedLast = true;
                            candidate  = candidate.Remove(candidate.Length - 1);
                        }

                        // Only a hyphenated token whose first part is a fraction is merged into the term.
                        string firstPart = candidate.Split('-')[0];
                        if (isAfrac(firstPart))
                        {
                            string hyphenTerm = handleHyphen(candidate, ref i, ref chopedLast).ToLower();
                            result = result + " " + hyphenTerm;
                        }
                    }
                }

                // Keep extending the term only if the last consumed token has no trailing character to strip.
                bool previousEndsClean = !toRemoveLast(splited[i - 1]);
                if (previousEndsClean)
                {
                    checkafterNumber(ref i, ref result);
                }
            }

            // Store the lower-cased term for this document.
            addToDic(result.ToLower(), docData.Name);
            return result;
        }
Пример #2
0
        /// <summary>
        /// Parses a document's text into terms and returns the term-frequency dictionary.
        /// </summary>
        /// <remarks>
        /// Resets <c>parserList</c>, <c>termsDic</c>, <c>maxFrecInt</c> and <c>splited</c>,
        /// and stores <paramref name="docData"/> in <c>docInfo</c> under its name.
        /// The order of the rule checks below is significant: the first matching rule wins.
        /// </remarks>
        /// <param name="docData">Document descriptor; its <c>Name</c> keys <c>docInfo</c> and is forwarded to <c>addToDic</c>.</param>
        /// <param name="content">Raw document text.</param>
        /// <returns>
        /// <c>termsDic</c>, or a new dictionary keyed by stem with merged counts when <c>stemBool</c> is set.
        /// </returns>
        public Dictionary <string, int> parseFile(DocumentData docData, string content)
        {
            parserList = new List <string>();
            // Register the document descriptor under its name.
            docInfo[docData.Name] = docData;
            termsDic   = new Dictionary <string, int>();
            maxFrecInt = 0;
            string[] delimiters = { " ", "\r\n", "\n", "--" };
            string   word;
            int      i = 0;

            // Split the document's content into tokens; empty entries are dropped,
            // so every element of splited has at least one character.
            splited = content.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            string termStr = "";

            // True when the current token ended with a character that was stripped.
            bool chopedLast = false;

            // Find the matching rule for each token.
            while (i < splited.Length)
            {
                chopedLast = false;

                // Chop unwanted characters from the start of the token.
                word = chopStart(splited[i]);

                // Chop unwanted characters from the end of the token.
                while (word != "" && toRemoveLast(word))
                {
                    chopedLast = true;
                    word       = word.Substring(0, word.Length - 1);
                }

                // Delete unwanted characters anywhere in the token.
                word = toRemoveAny(word);

                // A token without any letter or digit is skipped.
                if (word == "" || !(word.Any(char.IsLetterOrDigit)))
                {
                    i++;
                }

                // The literal word "Language" is skipped.
                else if (word == "Language")
                {
                    i++;
                }

                // Skip everything from "<F" up to and including the closing "</F>" token.
                else if (splited[i] == "<F")
                {
                    while (i < splited.Length - 1 && splited[i] != "</F>")
                    {
                        i++;
                    }
                    i++;
                }

                // Skip the "Article Type ..." header. The bounds check prevents reading
                // past the end when "Article" is the last token.
                else if (word == "Article" && i < splited.Length - 1 && splited[i + 1].StartsWith("Type"))
                {
                    i = i + 3;
                }

                else if (splited[i] == "[Text]")
                {
                    i++;
                }

                // Skip tokens that open or close like a markup tag.
                else if (splited[i][0] == '<' || splited[i][splited[i].Length - 1] == '>')
                {
                    i++;
                }

                // Dates: checkAndParseDate stores the term itself and is expected to
                // have advanced i when it returns true, so nothing is left to do here.
                else if (checkAndParseDate(ref i, docData.Name) == true)
                {
                }

                // "between <number> and <number>" rule.
                else if (!chopedLast && i < splited.Length - 3 && (word == "Between" || word == "between") && (splited[i + 2] == "And" || splited[i + 2] == "and") && (isNumber(splited[i + 1]) || isAfrac(splited[i + 1])) && (isNumberWithAfter(splited[i + 3]) || isAfracWithAfter(splited[i + 3])))
                {
                    string tmpS = splited[i + 3];

                    // Chop the unnecessary characters from the end of the upper bound.
                    while (tmpS != "" && toRemoveLast(tmpS))
                    {
                        tmpS = tmpS.Substring(0, tmpS.Length - 1);
                    }
                    termStr = "between " + parseNumber(splited[i + 1]) + " and " + parseNumber(tmpS);
                    i       = i + 4;
                    addToDic(termStr, docData.Name);
                }

                // All-uppercase token: consecutive ones are stored as a single term by the helper.
                else if (!chopedLast && word.Length > 1 && (word.All(char.IsUpper)))
                {
                    handleCapitalLetters(docData, ref word, ref i);
                }

                // Token starting with '$'.
                else if (word[0] == '$')
                {
                    handleDollarAtStart(docData, ref word, ref i, ref termStr);
                }

                // Token with an inner hyphen (not leading, not trailing, not a double hyphen).
                else if (word.Contains('-') && !word.Contains("--") && word[0] != '-' && word[word.Length - 1] != '-')
                {
                    termStr = handleHyphen(word, ref i, ref chopedLast);

                    // Store the new term in the parser's dictionary.
                    addToDic(termStr.ToLower(), docData.Name);
                }

                // Fraction.
                else if (isAfrac(word))
                {
                    termStr = word;
                    i++;
                    addToDic(termStr, docData.Name);
                }

                // Number.
                else if (isNumber(word))
                {
                    termStr = handleNumber(docData, word, ref i, ref chopedLast);
                }

                // Letters only: store the lower-cased word unless it is a stop word.
                else if (word.All(char.IsLetter))
                {
                    word = word.ToLower();
                    if (!stopWords.Contains(word))
                    {
                        termStr = word;
                        addToDic(termStr.ToLower(), docData.Name);
                    }
                    i++;
                }

                // Mixed-content stop word.
                else if (stopWords.Contains(word.ToLower()))
                {
                    i++;
                }

                // No rule matched: store the token as is (lower-cased).
                else
                {
                    termStr = word;
                    addToDic(termStr.ToLower(), docData.Name);
                    i++;
                }
            }

            // Merge the counts of terms that share a stem when stemming is enabled.
            if (stemBool == true)
            {
                Dictionary <string, int> termsDicStemer = new Dictionary <string, int>();
                int stemmedMax = 0;
                foreach (KeyValuePair <string, int> entry in termsDic)
                {
                    string stem = stemmer.stemTerm(entry.Key);

                    // TryGetValue leaves merged at 0 when the stem is new.
                    int merged;
                    termsDicStemer.TryGetValue(stem, out merged);
                    merged += entry.Value;
                    termsDicStemer[stem] = merged;

                    if (merged > stemmedMax)
                    {
                        stemmedMax = merged;
                    }
                }

                // Previously max_tf was never written on this path. Merged stems can
                // exceed the per-word maximum, so record the maximum of the dictionary
                // that is actually returned.
                docInfo[docData.Name].max_tf = stemmedMax;

                // Return the dictionary after stemming.
                return(termsDicStemer);
            }
            docInfo[docData.Name].max_tf = maxFrecInt;

            // Return the terms' dictionary.
            return(termsDic);
        }
Пример #3
0
        /// <summary>
        /// Handles a token that starts with '$' (a price) and stores the resulting term
        /// through <c>addToDic</c>.
        /// </summary>
        /// <param name="docData">Document being parsed; its <c>Name</c> is forwarded to <c>addToDic</c>.</param>
        /// <param name="word">The token, starting with '$'; trimmed and stripped of the '$' in place.</param>
        /// <param name="i">Index into <c>splited</c>; moved past every token consumed here.</param>
        /// <param name="termStr">Receives the term that was built.</param>
        private void handleDollarAtStart(DocumentData docData, ref string word, ref int i, ref string termStr)
        {
            if (!word.Contains('-'))
            {
                // Chop unnecessary characters from the end of the word.
                while (word != "" && toRemoveLast(word))
                {
                    word = word.Substring(0, word.Length - 1);
                }

                // Nothing left after trimming: skip the token instead of failing on Substring(1).
                if (word == "")
                {
                    i++;
                    return;
                }

                // Chop the $ from the beginning.
                word = word.Substring(1);
                if (isAfrac(word))
                {
                    termStr = word;
                    i++;
                }
                else if (!isNumber(word))
                {
                    // Not a number: store the bare word and stop here. Falling through
                    // would append " dollars" to whatever term the caller built last
                    // and add that unrelated string to the dictionary.
                    termStr = word;
                    i++;
                    addToDic(termStr.ToLower(), docData.Name);
                    return;
                }
                // "Big" number (per the original note: larger than a million). The bounds
                // check mirrors handleNumber: a following token is required.
                else if (i < splited.Length - 1 && isBigAfter(ref i))
                {
                    termStr = parseBig(word, ref i);
                    i      += 2;
                }
                // Any other valid number.
                else
                {
                    termStr = parseNumber(word);
                    i++;

                    // A fraction may follow the number; guard against the price being the last token.
                    if (i < splited.Length && isAfracWithAfter(splited[i]))
                    {
                        termStr += parseFrac(ref i);
                        i++;
                    }
                }

                // Add "dollars" to the end of the term.
                termStr += " dollars";
                addToDic(termStr.ToLower(), docData.Name);
            }

            // Hyphenated price: the helper builds the whole term from the token.
            else if (word.Length > 1)
            {
                i++;
                termStr = handleHyphenDollar(word);
                addToDic(termStr.ToLower(), docData.Name);
            }
            else
            {
                // A token that contains '-' always has length >= 1; kept as a safety net.
                i++;
            }
        }