Пример #1
0
        /// <summary>
        /// Merges the sorted temporary files (<c>index{i}sorted.txt</c> under <c>path</c>) into the
        /// posting files and the <c>index.txt</c> file under <c>ipath</c>, then deletes the
        /// temporary files.
        /// </summary>
        /// <remarks>
        /// Every input line is tab separated and is read here as: field 0 = sort key (phrase),
        /// field 1 = "T" when the occurrence is capitalised, field 2 = term type name,
        /// field 3 = posting data, field 4 and field 5 = numbers accumulated into icf and idf.
        /// Lines that share the same phrase (ignoring case) and the same type are combined into a
        /// single posting line. Words that start with A-Z go to that letter's file, all other
        /// words go to <c>other.txt</c>, and every non-word type goes to <c>{type}.txt</c>.
        /// Fully capitalised words written to a letter file are also pushed onto <c>q</c>.
        /// The temporary files are only deleted when the merge completes without an exception.
        /// </remarks>
        /// <param name="fileCount">number of sorted temporary files to merge</param>
        /// <param name="files">files array (not used by this method)</param>
        private void merge(int fileCount, string[] files)
        {
            // Sentinel kept in a slot once the corresponding file has been read to the end.
            const string Exhausted = "\0";

            string[]       firstLines = new string[fileCount];
            StreamReader[] sr = new StreamReader[fileCount];
            Dictionary <string, StreamWriter> writers = new Dictionary <string, StreamWriter>();

            try
            {
                for (int i = 0; i < fileCount; i++)
                {
                    sr[i] = new StreamReader(path + "\\index" + i + "sorted.txt");
                }
                for (char c = 'A'; c <= 'Z'; c++)
                {
                    writers.Add(c + "", new StreamWriter(ipath + "\\" + c + ".txt"));
                }
                foreach (term.Type t in Enum.GetValues(typeof(term.Type)))
                {
                    // Words are spread over the per-letter files, so they get no file of their own.
                    if (t == term.Type.word)
                    {
                        continue;
                    }
                    writers.Add(t.ToString(), new StreamWriter(ipath + "\\" + t.ToString() + ".txt"));
                }
                writers.Add("other", new StreamWriter(ipath + "\\other.txt"));
                writers.Add("index", new StreamWriter(ipath + "\\index.txt"));

                // Prime one pending line per file, skipping leading blank lines.
                for (int i = 0; i < fileCount; i++)
                {
                    firstLines[i] = ReadNextMergeLine(sr[i]);
                }

                StringBuilder minLine = new StringBuilder();

                while (true)
                {
                    minLine.Clear();

                    string[] sortedFirstLines = firstLines.OrderBy(s => s.Split(new string[] { "\t" }, StringSplitOptions.None)[0]).ToArray();

                    // Skip the slots of files that are already exhausted.
                    int i = 0;
                    while (i < sortedFirstLines.Length && sortedFirstLines[i].Equals(Exhausted))
                    {
                        i++;
                    }
                    if (i >= sortedFirstLines.Length)
                    {
                        // Every file has been consumed.
                        break;
                    }

                    string    minPhrase = GetPhrase(sortedFirstLines[i]) ?? string.Empty;
                    term.Type type = GetType(sortedFirstLines[i]);
                    bool      Cap = true;
                    double    icf = 0, idf = 0;

                    // Combine every pending line that carries the same phrase and type.
                    for (i = 0; i < sortedFirstLines.Length; i++)
                    {
                        if (sortedFirstLines[i].Equals(Exhausted))
                        {
                            continue;
                        }
                        if (string.Compare(minPhrase, GetPhrase(sortedFirstLines[i]), true) != 0)
                        {
                            // The array is sorted, so no later entry can match either.
                            break;
                        }
                        string[] splitted = sortedFirstLines[i].Split('\t');
                        if (type != (term.Type)Enum.Parse(typeof(term.Type), splitted[2], true))
                        {
                            continue;
                        }
                        // The merged term stays upper case only if every occurrence was capitalised.
                        Cap &= splitted[1].Equals("T");
                        minLine.Append(splitted[3]);
                        icf += double.Parse(splitted[4]);
                        idf += double.Parse(splitted[5]);
                    }

                    string termPhrase = Cap ? minPhrase.ToUpper() : minPhrase.ToLower();
                    string postingLine = termPhrase + "\t" + minLine.ToString();

                    if (type == term.Type.word)
                    {
                        // Only A-Z have their own file; an empty phrase or any other leading
                        // character (including non-ASCII letters) falls back to "other".
                        StreamWriter letterWriter = null;
                        bool hasLetterFile = minPhrase.Length > 0
                                             && char.IsLetter(minPhrase[0])
                                             && writers.TryGetValue(char.ToUpperInvariant(minPhrase[0]).ToString(), out letterWriter);
                        if (hasLetterFile)
                        {
                            letterWriter.WriteLine(postingLine);
                            if (Cap)
                            {
                                q.Enqueue(postingLine);
                                q_list.Release(1);
                            }
                        }
                        else
                        {
                            writers["other"].WriteLine(postingLine);
                        }
                    }
                    else
                    {
                        writers[type.ToString()].WriteLine(postingLine);
                    }

                    // Advance every file whose pending line was just merged.
                    for (i = 0; i < fileCount; i++)
                    {
                        if (firstLines[i].Equals(Exhausted))
                        {
                            continue;
                        }
                        if (string.Compare(minPhrase, GetPhrase(firstLines[i]), true) != 0)
                        {
                            continue;
                        }
                        if (type != GetType(firstLines[i]))
                        {
                            continue;
                        }
                        firstLines[i] = ReadNextMergeLine(sr[i]);
                    }

                    writers["index"].WriteLine(termPhrase + "\t" + (int)type + "\t" + icf + "\t" + idf);
                }
            }
            finally
            {
                // Release every handle that was opened, even when the merge failed part-way.
                foreach (StreamReader reader in sr)
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
                foreach (StreamWriter writer in writers.Values)
                {
                    writer.Close();
                }
            }

            // Reached only after a successful merge: the temporary files are no longer needed.
            for (int i = 0; i < fileCount; i++)
            {
                File.Delete(path + "\\index" + i + "sorted.txt");
            }
        }

        /// <summary>
        /// Reads the next non-empty line from <paramref name="reader"/>.
        /// </summary>
        /// <param name="reader">reader positioned inside one of the sorted temporary files</param>
        /// <returns>the next non-empty line, or "\0" once the end of the file is reached</returns>
        private static string ReadNextMergeLine(StreamReader reader)
        {
            string line;
            do
            {
                line = reader.ReadLine();
            }
            while (line != null && line.Length == 0);

            return line ?? "\0";
        }
Пример #2
0
 /// <summary>
 /// Builds the text form of this instance: the phrase, a tab character, then the type name.
 /// </summary>
 /// <returns>phrase and type joined by a single tab</returns>
 public override string ToString()
 {
     string typeName = type.ToString();

     return string.Concat(phrase, "\t", typeName);
 }