/// <summary>
/// Simplifies an input string with various options.
/// </summary>
/// <param name="input">String to be simplified</param>
/// <param name="removeFirst">Whether to always remove first word during simplification</param>
/// <param name="removeLast">Whether to always remove last word during simplification</param>
/// <param name="options">Selection of optional remove words; bit j enables OptionalRemoveWords[j]</param>
/// <param name="disableRemAfter">Disable removing of words that follow defined words to remove</param>
/// <param name="wordSplitEn">Enables splitting words in the string using dictionary (e.g. "howimetyourmother" split to "how i met your mother")</param>
/// <param name="removeWhitespace">Whether to replace separators (runs of non-word characters, and underscores) with a single space</param>
/// <param name="removeBrackContents">Whether to remove parenthesised text, including the parentheses themselves</param>
/// <returns>Simplified string results: the simplified string, the words removed from it and the modifications applied</returns>
public static SimplifyStringResults BuildSimplifyResults(string input, bool removeFirst, bool removeLast, OptionalSimplifyRemoves options, bool disableRemAfter, bool wordSplitEn, bool removeWhitespace, bool removeBrackContents)
{
    // Lower-case using the invariant culture so the result does not depend on the
    // current thread culture (e.g. Turkish dotless i), then spell out ampersands.
    string simplifiedName = input.ToLowerInvariant().Replace("&", "and");

    // Initialize string modifications
    ContentSearchMod mods = ContentSearchMod.None;

    // Remove contents inside any round brackets. The modification flag is set
    // whenever the option is enabled, whether or not anything matched.
    if (removeBrackContents)
    {
        simplifiedName = Regex.Replace(simplifiedName, @"\([^\)]*\)", " ");
        mods |= ContentSearchMod.BrackRemoval;
    }

    // Drop apostrophes entirely; replace !, ?, (, ), :, [ and ] with a space
    simplifiedName = Regex.Replace(simplifiedName, @"[']+", "");
    simplifiedName = Regex.Replace(simplifiedName, @"[!\?\u0028\u0029\:\]\[]+", " ");

    // Replace separators with spaces
    if (removeWhitespace)
    {
        simplifiedName = Regex.Replace(simplifiedName, @"\W+|_", " ");
    }

    // Initialize removed words dictionary
    Dictionary<FileWordType, List<string>> removeFileWords = new Dictionary<FileWordType, List<string>>();

    // Process each optional remove word that is enabled in the options
    for (int j = 0; j < OptionalRemoveWords.Length; j++)
    {
        // Bit j of the options selects OptionalRemoveWords[j]
        int optionBit = 1 << j;
        if (((int)options & optionBit) == 0)
        {
            continue;
        }

        bool removed;
        simplifiedName = RemoveWord(disableRemAfter, simplifiedName, removeFileWords, OptionalRemoveWords[j], out removed);
        if (!removed)
        {
            continue;
        }

        // Compare the flag value (1 << j), not the index j, against the enum members:
        // the index only selects the word, while the enum members are tested as bit flags.
        // NOTE(review): assumes Year and YearAndFollowing are single-bit flags - confirm against the enum declaration.
        OptionalSimplifyRemoves removedOption = (OptionalSimplifyRemoves)optionBit;
        if (removedOption == OptionalSimplifyRemoves.Year || removedOption == OptionalSimplifyRemoves.YearAndFollowing)
        {
            mods |= ContentSearchMod.YearRemoved;
        }
        else
        {
            mods |= ContentSearchMod.WordsRemoved;
        }
    }

    // Process always remove words
    foreach (RemoveFileWord remWord in AlwaysRemoveWords)
    {
        simplifiedName = RemoveWord(disableRemAfter, simplifiedName, removeFileWords, remWord);
    }

    // Remove first word (together with any non-word characters in front of it).
    // The modification flag records that the option was requested, even if no word was found.
    if (removeFirst)
    {
        Match firstWordMatch = Regex.Match(simplifiedName, @"^\W*\w+");
        if (firstWordMatch.Success)
        {
            simplifiedName = simplifiedName.Remove(firstWordMatch.Index, firstWordMatch.Length);
        }
        mods |= ContentSearchMod.WordsRemoved;
    }

    // Remove last word (together with any non-word characters trailing it)
    if (removeLast)
    {
        Match lastWordMatch = Regex.Match(simplifiedName, @"(\w+\W*)$");
        if (lastWordMatch.Success)
        {
            simplifiedName = simplifiedName.Remove(lastWordMatch.Index, lastWordMatch.Length);
        }
        mods |= ContentSearchMod.WordsRemoved;
    }

    // Word splitting
    if (wordSplitEn)
    {
        // Separate input at spaces and try to split each piece individually
        string[] words = simplifiedName.Split(' ');
        string[] splitWords = new string[words.Length];
        bool split = false;
        for (int w = 0; w < words.Length; w++)
        {
            string newWord;
            if (WordHelper.TrySplitWords(words[w], out newWord))
            {
                split = true;
            }
            splitWords[w] = newWord;
        }

        // Re-assemble in one go instead of concatenating inside the loop
        simplifiedName = string.Join(" ", splitWords);

        if (split)
        {
            mods |= ContentSearchMod.WordSlit;
        }
    }

    // Collapse the runs of spaces left behind by removals and splitting, then trim the ends
    simplifiedName = Regex.Replace(simplifiedName, @" {2,}", " ").Trim();

    return new SimplifyStringResults(simplifiedName, removeFileWords, mods);
}
/// <summary>
/// Creates a list of simplified strings from an input string (multiple results created from enabling various optional word removals).
/// </summary>
/// <remarks>
/// Every combination of word splitting, first/last word removal, optional word removals and
/// bracket content removal is tried. Results that are empty, that duplicate an earlier result,
/// that consist only of a year which was not removed as a year word, or that are a single
/// dictionary word shorter than three characters are left out.
/// </remarks>
/// <param name="input">String to be simplified</param>
/// <returns>List of distinct simplified string results, in the order they were first produced</returns>
public static List<SimplifyStringResults> SimplifyString(string input)
{
    // Results in order of discovery, plus a set of the strings already accepted so that
    // duplicate detection does not need to re-scan the whole list for every candidate
    List<SimplifyStringResults> simplifiedStrings = new List<SimplifyStringResults>();
    HashSet<string> seenStrings = new HashSet<string>();

    // Number of option combinations: one bit per optional remove word, plus two
    // low bits for "remove first word" and "remove last word"
    int optionCombinations = 1 << (OptionalRemoveWords.Length + 2);

    // Loop twice: without and with word splitting
    for (int i = 0; i < 2; i++)
    {
        bool wordSplit = i == 1;

        // Go through all combinations of optional removes
        for (int j = 0; j < optionCombinations; j++)
        {
            // Decode the combination once; it does not depend on the bracket setting
            bool removeFirst = (j & 1) > 0;
            bool removeLast = (j & 2) > 0;
            OptionalSimplifyRemoves options = (OptionalSimplifyRemoves)(j >> 2);

            // Don't do both year removes
            if ((options & OptionalSimplifyRemoves.Year) > 0 && (options & OptionalSimplifyRemoves.YearAndFollowing) > 0)
            {
                continue;
            }

            // Without and with bracket content removal
            for (int k = 0; k < 2; k++)
            {
                // Get results
                SimplifyStringResults simpleRes = BuildSimplifyResults(input, removeFirst, removeLast, options, false, wordSplit, true, k == 1);
                string simplified = simpleRes.SimplifiedString;

                // Don't allow result that is only the year
                if (Regex.IsMatch(simplified, @"^(19|20)\d{2}$") && !simpleRes.RemovedWords.ContainsKey(FileWordType.Year))
                {
                    continue;
                }

                // Don't let common single words through
                if (!simplified.Contains(' ') && simplified.Length < 3 && WordHelper.IsWord(simplified))
                {
                    continue;
                }

                // Keep only non-empty simplifications that have not been produced before
                if (!string.IsNullOrEmpty(simplified) && seenStrings.Add(simplified))
                {
                    simplifiedStrings.Add(simpleRes);
                }
            }
        }
    }

    return simplifiedStrings;
}