/// <summary>
/// Builds the term for a numeric token and registers it (lower-cased) through addToDic.
/// </summary>
/// <param name="docData">Document being parsed; its Name is passed to addToDic.</param>
/// <param name="word">The already-trimmed numeric token.</param>
/// <param name="i">Index into <c>splited</c>; advanced past every token this method consumes.</param>
/// <param name="chopedLast">True when trailing characters were stripped from the token; may be set to true here while trimming a following hyphenated token.</param>
/// <returns>The term that was built (not lower-cased).</returns>
private string handleNumber(DocumentData docData, string word, ref int i, ref bool chopedLast)
{
    string termStr;
    bool hasDecimalPoint = word.Contains('.');

    // A following "big" word (the original comment says: bigger than a million) is folded
    // into the term by parseBig; otherwise the number stands alone. Decimal numbers are kept
    // verbatim, whole numbers go through parseNumber.
    if (i < splited.Length - 1 && isBigAfter(ref i))
    {
        termStr = parseBig(word, ref i);
        i += 2;
    }
    else
    {
        termStr = hasDecimalPoint ? word : parseNumber(word);
        i += 1;
    }

    // Only look at the following tokens when the number did not end with punctuation or a symbol.
    if (!chopedLast)
    {
        if (i < splited.Length)
        {
            string following = splited[i];

            if (isAfracWithAfter(following))
            {
                // number followed by a fraction: append it and step over the fraction token
                termStr += parseFrac(ref i);
                i += 1;
            }
            else if (following.Contains('-'))
            {
                // strip trailing unwanted characters from the hyphenated token
                string trimmed = following;
                while (trimmed != "" && toRemoveLast(trimmed))
                {
                    chopedLast = true;
                    trimmed = trimmed.Substring(0, trimmed.Length - 1);
                }

                // only join when the part before the first hyphen is a fraction
                string beforeHyphen = trimmed.Split('-')[0];
                if (isAfrac(beforeHyphen))
                {
                    termStr += " " + handleHyphen(trimmed, ref i, ref chopedLast).ToLower();
                }
            }
        }

        // keep checking the next tokens only when the last consumed token has no trailing
        // character that toRemoveLast would strip
        if (!toRemoveLast(splited[i - 1]))
        {
            checkafterNumber(ref i, ref termStr);
        }
    }

    // store the lower-cased term in the dictionary
    addToDic(termStr.ToLower(), docData.Name);
    return termStr;
}
/// <summary>
/// Parses one document: splits <paramref name="content"/> on spaces, line breaks and "--",
/// then walks the tokens and applies the first matching rule (markup skip, date, "between",
/// capitals, price, hyphen, fraction, number, plain word, stop word, fallback).
/// Terms are recorded through addToDic and the rule helpers.
/// </summary>
/// <param name="docData">Document metadata; stored in <c>docInfo</c> under its Name.</param>
/// <param name="content">Raw text of the document.</param>
/// <returns>
/// <c>termsDic</c> (presumably filled by addToDic), or, when <c>stemBool</c> is true, a new
/// dictionary in which the counts of terms sharing a stem are summed.
/// </returns>
public Dictionary<string, int> parseFile(DocumentData docData, string content)
{
    parserList = new List<string>();
    // add the data from the readFile instance
    docInfo[docData.Name] = docData;
    termsDic = new Dictionary<string, int>();
    maxFrecInt = 0;

    string[] delimiters = { " ", "\r\n", "\n", "--" };
    string word;
    int i = 0;
    // split the document's content into a string array
    splited = content.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
    string termStr = "";
    // indicates whether the current word ended with punctuation
    bool chopedLast = false;

    // find the matching rule for each term
    while (i < splited.Length)
    {
        chopedLast = false;

        // chop unwanted characters from the start and the end of the word
        word = chopStart(splited[i]);
        while (word != "" && toRemoveLast(word))
        {
            chopedLast = true;
            word = word.Substring(0, word.Length - 1);
        }
        // delete unwanted characters anywhere in the word
        word = toRemoveAny(word);

        // a token without any letter or digit is not parsed
        if (word == "" || !word.Any(char.IsLetterOrDigit))
        {
            i++;
        }
        // the word "Language" is skipped (reason not stated in the original)
        else if (word == "Language")
        {
            i++;
        }
        // skip everything from "<F" up to and including "</F>"
        else if (splited[i] == "<F")
        {
            while (i < splited.Length - 1 && splited[i] != "</F>")
            {
                i++;
            }
            i++;
        }
        // skip the article-type header; the bounds check prevents reading past the last token
        else if (word == "Article" && i + 1 < splited.Length && splited[i + 1].StartsWith("Type"))
        {
            i = i + 3;
        }
        else if (splited[i] == "[Text]")
        {
            i++;
        }
        // skip markup tokens
        else if (splited[i][0] == '<' || splited[i][splited[i].Length - 1] == '>')
        {
            i++;
        }
        // dates: checkAndParseDate advances i and stores the term itself when it matches
        else if (checkAndParseDate(ref i, docData.Name) == true)
        {
        }
        // "between X and Y" rule
        else if (!chopedLast && i < splited.Length - 3 &&
                 (word == "Between" || word == "between") &&
                 (splited[i + 2] == "And" || splited[i + 2] == "and") &&
                 (isNumber(splited[i + 1]) || isAfrac(splited[i + 1])) &&
                 (isNumberWithAfter(splited[i + 3]) || isAfracWithAfter(splited[i + 3])))
        {
            string tmpS = splited[i + 3];
            // chop unnecessary characters from the end of the upper bound
            // (guarded against the empty string like the other trimming loops)
            while (tmpS != "" && toRemoveLast(tmpS))
            {
                tmpS = tmpS.Substring(0, tmpS.Length - 1);
            }
            termStr = "between " + parseNumber(splited[i + 1]) + " and " + parseNumber(tmpS);
            i = i + 4;
            addToDic(termStr, docData.Name);
        }
        // words made only of capital letters: consecutive ones are stored as one term
        else if (!chopedLast && word.Length > 1 && word.All(char.IsUpper))
        {
            handleCapitalLetters(docData, ref word, ref i);
        }
        // the word starts with $
        else if (word[0] == '$')
        {
            handleDollarAtStart(docData, ref word, ref i, ref termStr);
        }
        // the word contains an inner hyphen
        else if (word.Contains('-') && !word.Contains("--") && word[0] != '-' && word[word.Length - 1] != '-')
        {
            termStr = handleHyphen(word, ref i, ref chopedLast);
            // store the new term in the parser's dictionary
            addToDic(termStr.ToLower(), docData.Name);
        }
        // the word is a fraction
        else if (isAfrac(word))
        {
            termStr = word;
            i++;
            addToDic(termStr, docData.Name);
        }
        // the word is a number
        else if (isNumber(word))
        {
            termStr = handleNumber(docData, word, ref i, ref chopedLast);
        }
        // the word contains only letters
        else if (word.All(char.IsLetter))
        {
            word = word.ToLower();
            if (!stopWords.Contains(word))
            {
                termStr = word;
                addToDic(termStr.ToLower(), docData.Name);
            }
            i++;
        }
        // the word is a stop word
        else if (stopWords.Contains(word.ToLower()))
        {
            i++;
        }
        // no rule matched: add the word as it is
        else
        {
            termStr = word;
            addToDic(termStr.ToLower(), docData.Name);
            i++;
        }
    }

    // stemming: merge the counts of all terms that share a stem
    if (stemBool == true)
    {
        Dictionary<string, int> termsDicStemer = new Dictionary<string, int>();
        int stemmedMax = 0;
        foreach (KeyValuePair<string, int> entry in termsDic)
        {
            string stem = stemmer.stemTerm(entry.Key);
            int total;
            // total stays 0 when the stem has not been seen yet
            termsDicStemer.TryGetValue(stem, out total);
            total += entry.Value;
            termsDicStemer[stem] = total;
            if (total > stemmedMax)
            {
                stemmedMax = total;
            }
        }
        // The original returned here without recording max_tf at all.
        // NOTE(review): the maximum of the merged counts is assumed to be the intended
        // value for a stemmed document - confirm against the consumers of max_tf.
        docInfo[docData.Name].max_tf = stemmedMax;
        // return the dictionary after stemming
        return termsDicStemer;
    }

    docInfo[docData.Name].max_tf = maxFrecInt;
    // return the terms' dictionary
    return termsDic;
}
/// <summary>
/// Handles a token that starts with '$': builds a "... dollars" term for numeric prices
/// (optionally followed by a fraction or a "big" word), delegates hyphenated prices to
/// handleHyphenDollar, and stores a non-numeric remainder as a plain term.
/// </summary>
/// <param name="docData">Document being parsed; its Name is passed to addToDic.</param>
/// <param name="word">The token starting with '$'; trimmed and stripped of the '$' in place.</param>
/// <param name="i">Index into <c>splited</c>; advanced past every token this method consumes.</param>
/// <param name="termStr">Receives the term that was built.</param>
private void handleDollarAtStart(DocumentData docData, ref string word, ref int i, ref string termStr)
{
    // hyphenated price: parse the number after the hyphen
    if (word.Contains('-'))
    {
        i++;
        if (word.Length > 1)
        {
            termStr = handleHyphenDollar(word);
            addToDic(termStr.ToLower(), docData.Name);
        }
        return;
    }

    // chop unnecessary characters from the end of the word
    while (word != "" && toRemoveLast(word))
    {
        word = word.Substring(0, word.Length - 1);
    }

    // chop the $ from the beginning (nothing to chop if trimming emptied the word)
    if (word.Length > 0)
    {
        word = word.Substring(1);
    }

    // a bare "$" carries no term; just step over it
    if (word == "")
    {
        i++;
        return;
    }

    if (isAfrac(word))
    {
        termStr = word;
        i++;
    }
    else if (!isNumber(word))
    {
        // Not a number: store the word itself as a term and stop here. The original fell
        // through and also stored "<previous termStr> dollars", because termStr was never
        // assigned on this path.
        i++;
        termStr = word;
        addToDic(word.ToLower(), docData.Name);
        return;
    }
    // valid number followed by a "big" word (same bounds guard as handleNumber)
    else if (i < splited.Length - 1 && isBigAfter(ref i))
    {
        termStr = parseBig(word, ref i);
        i += 2;
    }
    // plain valid number, optionally followed by a fraction
    else
    {
        termStr = parseNumber(word);
        i++;
        // the price may be the last token, so check the bound before peeking
        if (i < splited.Length && isAfracWithAfter(splited[i]))
        {
            termStr += parseFrac(ref i);
            i++;
        }
    }

    // add "dollars" to the end of the term
    termStr += " dollars";
    addToDic(termStr.ToLower(), docData.Name);
}