/// <summary>
/// Parses text files and generates a list of segments, each of which is a list of words.
/// </summary>
/// <param name="dataFiles">
/// Pairs of (directory, search pattern). Each pair is passed to
/// <c>Directory.EnumerateFiles</c> with <c>SearchOption.AllDirectories</c>, so sub-directories are included.
/// </param>
/// <param name="writeOut">Currently not read by this method; kept in the signature for existing callers.</param>
/// <returns>A list of segments, each of which is a list of lower-cased words (always two or more per segment).</returns>
public static List<List<string>> ParseData(TupleList<string, string> dataFiles, bool writeOut = true)
{
    // The parsing algorithm is as follows:
    // 1. Read one line at a time, trim it, and skip it if nothing but white space is left.
    // 2. Skip the line if it contains any of the ignore markers (currently the html entity prefix "&#").
    // 3. Hand the line to TextParser.ExtractParentheses and process each returned group separately.
    //    (Presumably this pulls parenthesised text out as its own sentence, since such groups are
    //    often self-sufficient - confirm against TextParser.)
    // 4. Break each group into segments on the characters . , ; : ! ?
    //    - Known limitation: numbers use , and . as well, e.g. 7,5 or 40.000.
    //    - Known limitation: the dot can be part of an ellipsis inside a sentence,
    //      for example: Là ngày... em sẽ xa ta luôn.
    //    - Characters such as ! and ? don't really affect word meanings and often signal end of phrase.
    // 5. Empty segments are dropped by the split itself.
    // 6. Break each segment into words on any white-space character (not just the ASCII space).
    // 7. Discard the whole segment if one of its words is exactly "*" or ">".
    // 8. Strip the quote characters ' " “ (8220) ” (8221) from words, since they normally only serve as emphasis.
    // 9. Keep the segment only if at least two words remain.
    string[] ignores = { "&#" };
    // Stored as strings so they can be passed to String.Replace without a per-word conversion.
    string[] quotes = { "'", "\"", "“", "”" };
    char[] segmentSeparators = { ',', ';', ':', '.', '!', '?' };
    HashSet<string> removeSet = new HashSet<string>(new string[] { "*", ">" });

    // NOTE(review): this instance is never used below. It is kept only because the VietConverter
    // constructor is not visible from here and may have side effects - remove once confirmed it has none.
    VietConverter converter = new VietConverter();

    List<List<string>> globalSegmentList = new List<List<string>>();

    foreach (var tuple in dataFiles)
    {
        var textFiles = Directory.EnumerateFiles(tuple.Item1, tuple.Item2, SearchOption.AllDirectories);
        foreach (var textFile in textFiles)
        {
            Console.WriteLine(textFile);

            using (StreamReader sr = new StreamReader(File.OpenRead(textFile)))
            {
                string rawLine;
                while ((rawLine = sr.ReadLine()) != null)
                {
                    string line = rawLine.Trim();

                    // Ignore white-space only lines and lines containing an ignore marker.
                    if (String.IsNullOrWhiteSpace(line) || Array.Exists(ignores, pattern => line.Contains(pattern)))
                    {
                        continue;
                    }

                    foreach (string parenGroup in TextParser.ExtractParentheses(line))
                    {
                        // Make sure once again that the groups aren't white-space only.
                        if (String.IsNullOrWhiteSpace(parenGroup))
                        {
                            continue;
                        }

                        string[] segmentArray = parenGroup.Split(segmentSeparators, StringSplitOptions.RemoveEmptyEntries);
                        foreach (string segment in segmentArray)
                        {
                            List<string> normSegment = NormalizeSegment(segment, removeSet, quotes);
                            if (normSegment != null)
                            {
                                globalSegmentList.Add(normSegment);
                            }
                        }
                    }
                }
            }
        }
    }

    return globalSegmentList;
}

/// <summary>
/// Splits one segment into normalized (quote-free, trimmed, lower-cased) words.
/// </summary>
/// <param name="segment">The raw segment text.</param>
/// <param name="removeSet">Tokens whose presence as a whole word causes the segment to be discarded.</param>
/// <param name="quotes">Quote strings that are removed from every word.</param>
/// <returns>The normalized words, or <c>null</c> when the segment must be discarded.</returns>
private static List<string> NormalizeSegment(string segment, HashSet<string> removeSet, string[] quotes)
{
    // An empty separator array makes Split use every white-space character as a delimiter.
    string[] wordArray = segment.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);

    // Single-word segments are not kept.
    if (wordArray.Length <= 1)
    {
        return null;
    }

    List<string> normSegment = new List<string>();
    foreach (string word in wordArray)
    {
        // A stand-alone symbol token invalidates the whole segment.
        if (removeSet.Contains(word))
        {
            return null;
        }

        string normWord = word;
        foreach (string quote in quotes)
        {
            normWord = normWord.Replace(quote, "");
        }

        normWord = normWord.Trim().ToLower();

        // A word made only of quote characters is empty by now and is skipped.
        if (!String.IsNullOrWhiteSpace(normWord))
        {
            normSegment.Add(normWord);
        }
    }

    // Removing quote-only words may have reduced the segment below two words.
    return normSegment.Count > 1 ? normSegment : null;
}
/// <summary>
/// Builds the lookup tables of this converter from the mappings returned by
/// <c>GetRawChars</c>, <c>GetAccentToRawMapping</c> and <c>GetAccentToCodeMapping</c>.
/// </summary>
public AccentConverter()
{
    string alphabet = this.GetRawChars();
    var rawMappings = this.GetAccentToRawMapping();

    // Append an upper-case twin of every mapping. The count is captured first so that
    // only the entries present before the loop are visited while the list grows.
    int originalCount = rawMappings.Count;
    for (int idx = 0; idx < originalCount; idx++)
    {
        var entry = rawMappings[idx];
        rawMappings.Add(entry.Item1.ToUpper(), Char.ToUpper(entry.Item2));
    }

    // Every letter in a mapping's Item1 points to that mapping's Item2.
    // Dictionary.Add throws on a duplicate letter.
    var accentToRaw = new Dictionary<char, char>();
    foreach (var mapping in rawMappings)
    {
        foreach (char letter in mapping.Item1)
        {
            accentToRaw.Add(letter, mapping.Item2);
        }
    }
    accentToRawCharMap = accentToRaw;

    TupleList<string, char> codeMappings = this.GetAccentToCodeMapping();

#if DEBUG
    // Verify that accent codes do not contain duplicates: Add throws on a repeated code.
    var seenCodes = new Dictionary<char, byte>();
    foreach (var codeEntry in codeMappings)
    {
        seenCodes.Add(codeEntry.Item2, 0);
    }
#endif

    // Same upper-case expansion as above; the code itself (Item2) is reused unchanged.
    originalCount = codeMappings.Count;
    for (int idx = 0; idx < originalCount; idx++)
    {
        var entry = codeMappings[idx];
        codeMappings.Add(entry.Item1.ToUpper(), entry.Item2);
    }

    var accentToCode = new Dictionary<char, char>();
    foreach (var mapping in codeMappings)
    {
        foreach (char letter in mapping.Item1)
        {
            accentToCode.Add(letter, mapping.Item2);
        }
    }
    accentToCodeMap = accentToCode;

#if DEBUG
    // Verify that number of accented letters only differ by the number of raw letters aeuioydAEUIOYD
    Debug.Assert(accentToCodeMap.Count - 14 == accentToRawCharMap.Count);
#endif

    accentToAsciiMap = new Dictionary<char, int>();
    asciiToAccentMap = new Dictionary<int, char>();

    // Lower-cased distinct keys followed by lower-cased distinct values, then sorted.
    List<char> sortedChars = accentToRawCharMap.Keys
        .Select(c => Char.ToLower(c))
        .Distinct()
        .Concat(accentToRawCharMap.Values.Select(c => Char.ToLower(c)).Distinct())
        .ToList();
    sortedChars.Sort();

    // Consecutive integer codes starting at 33.
    // NOTE(review): 33 is '!' in ASCII; presumably chosen as the first printable non-space character - confirm.
    int code = 33;
    foreach (char c in sortedChars)
    {
        asciiToAccentMap.Add(code, c);
        accentToAsciiMap.Add(c, code);
        code++;
    }

    // Index of each mapping (upper-case twins included) keyed by its Item2 character.
    rawCharMap = new Dictionary<char, int>();
    int mappingIndex = 0;
    foreach (var mapping in rawMappings)
    {
        rawCharMap.Add(mapping.Item2, mappingIndex);
        mappingIndex++;
    }

    // Position of each character inside the string returned by GetRawChars.
    rawAlphabetMap = new Dictionary<char, int>();
    int charIndex = 0;
    foreach (char c in alphabet)
    {
        rawAlphabetMap.Add(c, charIndex);
        charIndex++;
    }
}