예제 #1
0
        /// <summary>
        /// Parses the text files selected by <paramref name="dataFiles"/> into a list of segments,
        /// each segment being a list of normalized words.
        /// </summary>
        /// <param name="dataFiles">
        /// Pairs of (root directory, file search pattern). Every root directory is searched
        /// recursively for files matching its pattern.
        /// </param>
        /// <param name="writeOut">Not used by this method; kept so existing callers keep compiling.</param>
        /// <returns>A list of segments, each of which is in turn a list of at least two words.</returns>
        public static List<List<string>> ParseData(TupleList<string, string> dataFiles, bool writeOut = true)
        {
            // The parsing algorithm is as follows:
            // 1. Read 1 line at a time, trim it, and skip it if only white space is left.
            // 2. Skip the line if it contains an html character reference marker (&#).
            // 3. Hand the line to TextParser.ExtractParentheses, which presumably returns the text inside
            //    parentheses as separate groups since they often are self-sufficient sentences.
            // 4. Break each group into segments at the characters . , ; : ! ?
            //      - An issue with the , . characters is that numbers use them as well, e.g. 7,5 or 40.000.
            //      - Another issue with the dot character is that it can be used as an ellipsis in the middle
            //        of a sentence. For example: Là ngày... em sẽ xa ta luôn.
            //      - Characters such as ! and ? don't really affect word meanings and often signal end of phrase.
            // 5. Empty segments are removed by the split itself.
            // 6. Break each segment into words using white-space separators.
            //      - Splitting with an empty separator array splits on every Unicode white-space character,
            //        not just the space character (ascii code 32).
            // 7. Drop the whole segment if one of its words is exactly * or >
            // 8. Remove quote characters ' " ” (8221) “ (8220) from words since they normally only serve as
            //    emphasis, then lower-case the word.
            // 9. Keep only segments that still hold at least two words.

            string[]        ignores           = { "&#" };
            string[]        quotes            = { "'", "\"", "“", "”" };
            char[]          segmentSeparators = { ',', ';', ':', '.', '!', '?' };
            HashSet<string> removeSet         = new HashSet<string>(new string[] { "*", ">" });

            // NOTE(review): this instance is never used below. It is kept only because the
            // VietConverter constructor is not visible here and may have side effects -
            // confirm and remove.
            VietConverter      converter         = new VietConverter();
            List<List<string>> globalSegmentList = new List<List<string>>();

            foreach (var tuple in dataFiles)
            {
                foreach (var textFile in Directory.EnumerateFiles(tuple.Item1, tuple.Item2, SearchOption.AllDirectories))
                {
                    Console.WriteLine(textFile);
                    using (StreamReader sr = new StreamReader(File.OpenRead(textFile)))
                    {
                        string rawLine;
                        while ((rawLine = sr.ReadLine()) != null)
                        {
                            string line = rawLine.Trim();

                            // Ignore white-space lines and lines holding an ignore pattern (html characters).
                            if (String.IsNullOrWhiteSpace(line) ||
                                Array.Exists(ignores, pattern => line.Contains(pattern)))
                            {
                                continue;
                            }

                            foreach (string group in TextParser.ExtractParentheses(line))
                            {
                                // Make sure once again that the groups aren't white-space only.
                                if (String.IsNullOrWhiteSpace(group))
                                {
                                    continue;
                                }

                                string[] segmentArray = group.Split(segmentSeparators, StringSplitOptions.RemoveEmptyEntries);
                                foreach (string segment in segmentArray)
                                {
                                    List<string> normSegment = NormalizeSegmentWords(segment, quotes, removeSet);

                                    // null means the segment was rejected; single-word segments are not kept either.
                                    if (normSegment != null && normSegment.Count > 1)
                                    {
                                        globalSegmentList.Add(normSegment);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return globalSegmentList;
        }

        /// <summary>
        /// Splits one segment on white space and normalizes each word by stripping the quote
        /// strings and lower-casing it with the invariant culture.
        /// </summary>
        /// <param name="segment">The segment text to split into words.</param>
        /// <param name="quotes">Strings that are removed from every word.</param>
        /// <param name="removeSet">Tokens whose presence as a whole word rejects the entire segment.</param>
        /// <returns>
        /// The normalized, non-empty words (an empty list when the segment has fewer than two
        /// raw words), or <c>null</c> when the segment contains a token from <paramref name="removeSet"/>.
        /// </returns>
        private static List<string> NormalizeSegmentWords(string segment, string[] quotes, HashSet<string> removeSet)
        {
            List<string> normSegment = new List<string>();

            // An empty separator array makes Split break on every white-space character.
            string[] wordArray = segment.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
            if (wordArray.Length <= 1)
            {
                return normSegment;
            }

            foreach (string word in wordArray)
            {
                if (removeSet.Contains(word))
                {
                    return null;
                }

                string normWord = word;
                foreach (string quote in quotes)
                {
                    normWord = normWord.Replace(quote, "");
                }

                // Invariant casing keeps the output independent of the machine's current culture.
                normWord = normWord.Trim().ToLowerInvariant();

                // A word made only of quote characters is empty by now and is dropped.
                if (!String.IsNullOrWhiteSpace(normWord))
                {
                    normSegment.Add(normWord);
                }
            }

            return normSegment;
        }
예제 #2
0
        /// <summary>
        /// Builds the character lookup tables used by this converter:
        /// accented letter to raw letter, accented letter to accent code, a two-way mapping
        /// between letters and integer codes starting at 33, raw letter to its tuple index,
        /// and raw alphabet character to its position.
        /// </summary>
        /// <remarks>
        /// All casing is done with the invariant culture so the tables do not depend on the
        /// current culture of the machine (e.g. Turkic cultures upper-case 'i' differently).
        /// </remarks>
        public AccentConverter()
        {
            string rawChars = this.GetRawChars();

            // Each tuple pairs a string of letters (Item1) with the single raw letter (Item2) they map to.
            var lettersTup = this.GetAccentToRawMapping();
            int count      = lettersTup.Count;

            // Append an upper-case twin of every original tuple. The loop bound is captured
            // beforehand because the list grows while it is being walked.
            for (int i = 0; i < count; i++)
            {
                lettersTup.Add(lettersTup[i].Item1.ToUpperInvariant(), Char.ToUpperInvariant(lettersTup[i].Item2));
            }

            // Flatten to one entry per letter of Item1. ToDictionary throws if a letter occurs twice.
            accentToRawCharMap = lettersTup.SelectMany(tup => tup.Item1
                                                       .Select(letter => new Tuple<char, char>(letter, tup.Item2)))
                                 .ToDictionary(x => x.Item1, x => x.Item2);


            // Each tuple pairs a string of letters (Item1) with the accent code character (Item2) they share.
            TupleList<string, char> accCodeTup = this.GetAccentToCodeMapping();

#if DEBUG
            // Verify that accent codes do not contain duplicates: Dictionary.Add throws on a repeated key.
            var accentCodeSet = new Dictionary<char, byte>();
            accCodeTup.ForEach(tup => accentCodeSet.Add(tup.Item2, 0));
#endif
            // Same doubling as above; the accent code itself is kept unchanged for the upper-case letters.
            count = accCodeTup.Count;
            for (int i = 0; i < count; i++)
            {
                accCodeTup.Add(accCodeTup[i].Item1.ToUpperInvariant(), accCodeTup[i].Item2);
            }
            accentToCodeMap = accCodeTup.SelectMany(tup => tup.Item1
                                                    .Select(letter => new Tuple<char, char>(letter, tup.Item2)))
                              .ToDictionary(x => x.Item1, x => x.Item2);

#if DEBUG
            // Verify that number of accented letters only differ by the number of raw letters aeuioydAEUIOYD
            Debug.Assert(accentToCodeMap.Count - 14 == accentToRawCharMap.Count);
#endif

            accentToAsciiMap = new Dictionary<char, int>();
            asciiToAccentMap = new Dictionary<int, char>();

            // Collect the lower-case form of every mapped letter followed by every raw letter, then sort
            // so that the assigned codes are deterministic.
            List<char> accentCharList = accentToRawCharMap.Keys.Select(a => Char.ToLowerInvariant(a)).Distinct().ToList();
            accentCharList.AddRange(accentToRawCharMap.Values.Select(a => Char.ToLowerInvariant(a)).Distinct());
            accentCharList.Sort();

            // Codes are consecutive integers starting at 33 - presumably chosen because 33 ('!') is the
            // first printable, non-space ASCII code; confirm against the consumers of these maps.
            for (int i = 0; i < accentCharList.Count; i++)
            {
                asciiToAccentMap.Add(i + 33, accentCharList[i]);
                accentToAsciiMap.Add(accentCharList[i], i + 33);
            }

            // Raw letter -> index of its tuple in the (lower-case followed by upper-case) letter list.
            rawCharMap = new Dictionary<char, int>();
            for (int iTup = 0; iTup < lettersTup.Count; iTup++)
            {
                rawCharMap.Add(lettersTup[iTup].Item2, iTup);
            }

            // Raw alphabet character -> its position inside the raw character string.
            rawAlphabetMap = new Dictionary<char, int>();
            for (int iChar = 0; iChar < rawChars.Length; iChar++)
            {
                rawAlphabetMap.Add(rawChars[iChar], iChar);
            }
        }