/// <summary>
/// Parses one document: splits <paramref name="content"/> into tokens, walks the
/// tokens through a chain of rules (markup skipping, dates, "between X and Y",
/// all-capital sequences, dollar amounts, hyphenated words, fractions, numbers,
/// plain words, stop words) and returns the per-document term counts.
/// </summary>
/// <remarks>
/// The method resets the parser-level state (parserList, termsDic, maxFrecInt),
/// stores <paramref name="docData"/> in docInfo under its Name and records the
/// document's maximum term frequency in docInfo[...].max_tf.
/// Terms are stored by the helper methods (addToDic and the handle* helpers);
/// presumably they fill termsDic and maintain maxFrecInt - confirm against
/// their definitions.
/// </remarks>
/// <param name="docData">Document metadata; its Name is used as the document key.</param>
/// <param name="content">Raw text of the document.</param>
/// <returns>
/// termsDic when stemming is off; otherwise a new dictionary in which the counts
/// of all terms sharing the same stem are summed.
/// </returns>
public Dictionary<string, int> parseFile(DocumentData docData, string content)
{
    parserList = new List<string>();
    // register the document's metadata under its name
    docInfo[docData.Name] = docData;
    termsDic = new Dictionary<string, int>();
    maxFrecInt = 0;

    // split the document's content into tokens (stored in the splited field,
    // which the helper rules read through the shared index i)
    string[] delimiters = { " ", "\r\n", "\n", "--" };
    splited = content.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

    // kept across iterations because handleDollarAtStart receives it by ref
    string termStr = "";
    int i = 0;

    // find the matching rule for each token; every branch must advance i
    while (i < splited.Length)
    {
        // true when at least one trailing character was chopped from the token
        bool chopedLast = false;

        // chop unwanted characters from the start, then from the end
        string word = chopStart(splited[i]);
        while (word != "" && toRemoveLast(word))
        {
            chopedLast = true;
            word = word.Substring(0, word.Length - 1);
        }

        // delete unwanted characters anywhere in the token
        word = toRemoveAny(word);

        if (word == "" || !word.Any(char.IsLetterOrDigit))
        {
            // nothing left that contains a letter or a digit: skip the token
            i++;
        }
        else if (word == "Language")
        {
            // the word "Language" is deliberately not indexed
            i++;
        }
        else if (splited[i] == "<F")
        {
            // skip everything up to and including the closing "</F>" token
            while (i < splited.Length - 1 && splited[i] != "</F>")
            {
                i++;
            }
            i++;
        }
        else if (word == "Article" && i + 1 < splited.Length && splited[i + 1].StartsWith("Type"))
        {
            // "Article Type..." header: skip it (three tokens). The bounds check
            // keeps a document that ends with "Article" from indexing past the array.
            i = i + 3;
        }
        else if (splited[i] == "[Text]")
        {
            i++;
        }
        else if (splited[i][0] == '<' || splited[i][splited[i].Length - 1] == '>')
        {
            // token that opens or closes like a markup tag: skip it
            i++;
        }
        else if (checkAndParseDate(ref i, docData.Name))
        {
            // nothing to do here: the helper handles the date itself. It receives
            // i by ref, presumably to advance past the tokens it consumed.
        }
        else if (!chopedLast && i < splited.Length - 3
                 && (word == "Between" || word == "between")
                 && (splited[i + 2] == "And" || splited[i + 2] == "and")
                 && (isNumber(splited[i + 1]) || isAfrac(splited[i + 1]))
                 && (isNumberWithAfter(splited[i + 3]) || isAfracWithAfter(splited[i + 3])))
        {
            // "between <number> and <number>" is stored as a single term
            string upperBound = splited[i + 3];
            // chop the unnecessary characters from the end of the second number
            while (upperBound != "" && toRemoveLast(upperBound))
            {
                upperBound = upperBound.Substring(0, upperBound.Length - 1);
            }
            termStr = "between " + parseNumber(splited[i + 1]) + " and " + parseNumber(upperBound);
            i = i + 4;
            addToDic(termStr, docData.Name);
        }
        else if (!chopedLast && word.Length > 1 && word.All(char.IsUpper))
        {
            // all-capital word: the helper is expected to join consecutive
            // capitalised tokens into one term and advance i
            handleCapitalLetters(docData, ref word, ref i);
        }
        else if (word[0] == '$')
        {
            handleDollarAtStart(docData, ref word, ref i, ref termStr);
        }
        else if (word.Contains('-') && !word.Contains("--") && word[0] != '-' && word[word.Length - 1] != '-')
        {
            // hyphen strictly inside the word
            termStr = handleHyphen(word, ref i, ref chopedLast);
            addToDic(termStr.ToLower(), docData.Name);
        }
        else if (isAfrac(word))
        {
            termStr = word;
            i++;
            addToDic(termStr, docData.Name);
        }
        else if (isNumber(word))
        {
            termStr = handleNumber(docData, word, ref i, ref chopedLast);
        }
        else if (word.All(char.IsLetter))
        {
            // plain word: index its lower-case form unless it is a stop word
            word = word.ToLower();
            if (!stopWords.Contains(word))
            {
                termStr = word;
                addToDic(termStr.ToLower(), docData.Name);
            }
            i++;
        }
        else if (stopWords.Contains(word.ToLower()))
        {
            i++;
        }
        else
        {
            // no rule matched: index the token as it is (lower-cased)
            termStr = word;
            addToDic(termStr.ToLower(), docData.Name);
            i++;
        }
    }

    if (stemBool)
    {
        // merge the counts of all terms that share the same stem
        Dictionary<string, int> termsDicStemer = new Dictionary<string, int>();
        int stemmedMax = 0;
        foreach (KeyValuePair<string, int> entry in termsDic)
        {
            string stemmed = stemmer.stemTerm(entry.Key);
            int merged;
            // merged is 0 when the stem has not been seen yet
            termsDicStemer.TryGetValue(stemmed, out merged);
            merged += entry.Value;
            termsDicStemer[stemmed] = merged;
            if (merged > stemmedMax)
            {
                stemmedMax = merged;
            }
        }

        // record the maximum frequency of the dictionary that is actually
        // returned; previously max_tf was never set on this path
        docInfo[docData.Name].max_tf = stemmedMax;
        return termsDicStemer;
    }

    docInfo[docData.Name].max_tf = maxFrecInt;
    return termsDic;
}
/// <summary>
/// Normalises a raw token and counts it in termPerDoc, updating maxtf and
/// maxterm when an existing term's frequency exceeds the current maximum.
/// </summary>
/// <remarks>
/// Normalisation: lower-case, strip surrounding '-' and spaces, remove every '/'.
/// Null tokens, tokens that normalise to nothing, and stop words are ignored.
/// A raw token that contains '.' and parses as a number is stored as that number
/// rounded to two decimals (invariant culture, so keys do not depend on the
/// machine's locale). A token that still contains '-' is delegated to splitslash.
/// Any other token is mapped through prefix (when it has an entry) and, when
/// toStem is set, replaced by its stem (cached in stemmers).
/// </remarks>
/// <param name="s">Raw token text.</param>
/// <param name="fa">
/// Value forwarded to the termInfo constructor and to splitslash; its meaning is
/// not visible in this method.
/// </param>
public void strToLowerCase(string s, int fa)
{
    if (string.IsNullOrEmpty(s))
    {
        return;
    }

    // Removing '/' can expose new leading/trailing spaces, hence the final Trim.
    // The empty check runs after the whole normalisation so that a token such
    // as "/ /" is not indexed as an empty term.
    string str = s.ToLower().Trim('-').Trim(' ').Replace("/", "").Trim(' ');
    if (str.Length == 0 || stop_words.Contains(str))
    {
        return;
    }

    string key;
    double number;
    // the numeric test is made on the raw token, not on the normalised one
    if (s.Contains(".") && Double.TryParse(
            s,
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
            System.Globalization.CultureInfo.InvariantCulture,
            out number))
    {
        // decimal numbers are indexed rounded to two decimals
        key = System.Math.Round(number, 2).ToString(System.Globalization.CultureInfo.InvariantCulture);
    }
    else
    {
        if (str.Contains("-"))
        {
            // hyphenated token: handled entirely by splitslash
            splitslash(str, fa);
            return;
        }

        string mapped;
        if (prefix.TryGetValue(str, out mapped))
        {
            str = mapped;
        }

        if (toStem)
        {
            // stems are cached so each distinct term is stemmed only once
            string cachedStem;
            if (!stemmers.TryGetValue(str, out cachedStem))
            {
                cachedStem = stem.stemTerm(str);
                stemmers.Add(str, cachedStem);
            }
            str = cachedStem;
        }

        key = str;
    }

    termInfo info;
    if (!termPerDoc.TryGetValue(key, out info))
    {
        // NOTE(review): as in the original, a first occurrence does not touch
        // maxtf/maxterm - confirm maxtf is initialised accordingly elsewhere.
        termPerDoc.Add(key, new termInfo(1, fa));
        return;
    }

    info.tf++;
    if (info.tf > maxtf)
    {
        maxtf = info.tf;
        maxterm = key;
    }
}