Exemplo n.º 1
0
        /// <summary>
        /// Reads the term type out of a tab-separated term line.
        /// </summary>
        /// <param name="line">tab-separated line whose third field (index 2) holds the type name</param>
        /// <returns>the <c>term.Type</c> member whose name matches that field, ignoring case</returns>
        private term.Type GetType(string line)
        {
            // The type name is stored in the third tab-separated column.
            string[] fields = line.Split('\t');
            object   parsed = Enum.Parse(typeof(term.Type), fields[2], true);

            return((term.Type)parsed);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a time phrase starting at <paramref name="idx"/>. When the following
        /// word is contained in <c>times</c> the phrase is two words long; otherwise it
        /// spans three words. If the array ends before the phrase is complete, the
        /// phrase is truncated to the words that exist instead of reading past the end.
        /// </summary>
        /// <param name="words">the tokenized text</param>
        /// <param name="idx">index of the first word of the phrase</param>
        /// <param name="j">receives the index of the last word included in the phrase</param>
        /// <param name="type">always set to <c>term.Type.time</c></param>
        /// <returns>the words of the phrase joined by single spaces</returns>
        public string CreatePhrase(string[] words, int idx, out int j, out term.Type type)
        {
            type = term.Type.time;

            // Read words[idx] first so that an invalid idx still fails exactly as before.
            string output = words[idx];

            // No following word at all: the phrase is just the current word.
            if (idx + 1 >= words.Length)
            {
                j = idx;
                return(output);
            }

            output += " " + words[idx + 1];

            // Two-word form: the next word is a known time word, or there is no third
            // word left to append (previously this case indexed past the array end).
            if (times.Contains(words[idx + 1]) || idx + 2 >= words.Length)
            {
                j = idx + 1;
                return(output);
            }

            j = idx + 2;
            return(output + " " + words[idx + 2]);
        }
Exemplo n.º 3
0
 /// <summary>
 /// Builds a two-word distance phrase from the word at <paramref name="idx"/> and
 /// the word after it. If <paramref name="idx"/> is the last word, the phrase is
 /// that single word instead of reading past the end of the array.
 /// </summary>
 /// <param name="words">the tokenized text</param>
 /// <param name="idx">index of the first word of the phrase</param>
 /// <param name="j">receives the index of the last word included in the phrase</param>
 /// <param name="type">always set to <c>term.Type.distance</c></param>
 /// <returns>the words of the phrase joined by a single space</returns>
 public string CreatePhrase(string[] words, int idx, out int j, out term.Type type)
 {
     type = term.Type.distance;

     // Read words[idx] first so that an invalid idx still fails exactly as before.
     string phrase = words[idx];

     // Previously words[idx + 1] was read unconditionally and threw on the last word.
     if (idx + 1 >= words.Length)
     {
         j = idx;
         return(phrase);
     }

     j = idx + 1;
     return(phrase + " " + words[idx + 1]);
 }
Exemplo n.º 4
0
        /// <summary>
        /// Merges the sorted temporary files <c>path\index{i}sorted.txt</c> into the
        /// posting files and the index file under <c>ipath</c>, then deletes the
        /// temporary files.
        /// </summary>
        /// <remarks>
        /// One output file is created per letter A-Z, one per <c>term.Type</c> value
        /// other than <c>word</c>, plus <c>other.txt</c> and <c>index.txt</c>.
        /// Input lines are tab separated; this method reads field 1 (compared with "T"),
        /// field 2 (type name), field 3 (appended to the posting line) and fields 4 and 5
        /// (summed into the values written to the index line). The phrase itself is
        /// obtained through <c>GetPhrase</c>.
        /// The string "\0" is used as the sentinel for a file that has no more lines.
        /// All readers and writers are closed even when the merge fails; the temporary
        /// files are only deleted after a successful merge.
        /// </remarks>
        /// <param name="fileCount">number of sorted temporary files to merge</param>
        /// <param name="files">files array (not read by this method)</param>
        private void merge(int fileCount, string[] files)
        {
            string[]       firstLines = new string[fileCount];
            StreamReader[] sr         = new StreamReader[fileCount];
            Dictionary<string, StreamWriter> writers = new Dictionary<string, StreamWriter>();

            try
            {
                for (int i = 0; i < fileCount; i++)
                {
                    sr[i] = new StreamReader(path + "\\index" + i + "sorted.txt");
                }
                for (char c = 'A'; c <= 'Z'; c++)
                {
                    writers.Add(c + "", new StreamWriter(ipath + "\\" + c + ".txt"));
                }
                foreach (term.Type t in Enum.GetValues(typeof(term.Type)))
                {
                    // Words are routed to the per-letter / "other" files instead.
                    if (t == term.Type.word)
                    {
                        continue;
                    }
                    writers.Add(t.ToString(), new StreamWriter(ipath + "\\" + t.ToString() + ".txt"));
                }
                writers.Add("other", new StreamWriter(ipath + "\\other.txt"));
                writers.Add("index", new StreamWriter(ipath + "\\index.txt"));

                // Prime the merge with the first non-blank line of every file.
                // (The previous retry-by-decrementing-i logic indexed firstLines[-1]
                // when the first file started with a blank line.)
                for (int i = 0; i < fileCount; i++)
                {
                    firstLines[i] = ReadNextMergeLine(sr[i]);
                }

                StringBuilder minLine   = new StringBuilder();
                StringBuilder minPhrase = new StringBuilder();

                while (true)
                {
                    minPhrase.Clear();
                    minLine.Clear();

                    // Order the current head lines by their first tab-separated field.
                    string[] sortedFirstLines = firstLines.OrderBy(s => s.Split(new string[] { "\t" }, StringSplitOptions.None)[0]).ToArray();

                    // Locate the first head line that is not the end-of-file sentinel.
                    int first = 0;
                    while (first < sortedFirstLines.Length && sortedFirstLines[first].Equals("\0"))
                    {
                        first++;
                    }
                    if (first >= sortedFirstLines.Length)
                    {
                        // Every file is exhausted.
                        break;
                    }

                    minPhrase.Append(GetPhrase(sortedFirstLines[first]));
                    string    phrase         = minPhrase.ToString();
                    term.Type type           = GetType(sortedFirstLines[first]);
                    bool      allCapitalized = true;
                    double    icf = 0, idf = 0;

                    // Accumulate every head line carrying the same phrase (ignoring case)
                    // and the same type; stop at the first line with a different phrase.
                    for (int i = 0; i < sortedFirstLines.Length; i++)
                    {
                        if (sortedFirstLines[i].Equals("\0"))
                        {
                            continue;
                        }
                        if (string.Compare(phrase, GetPhrase(sortedFirstLines[i]), true) != 0)
                        {
                            break;
                        }
                        string[] splitted = sortedFirstLines[i].Split('\t');
                        if (type != GetType(sortedFirstLines[i]))
                        {
                            continue;
                        }
                        allCapitalized &= splitted[1].Equals("T");
                        minLine.Append(splitted[3]);
                        icf += double.Parse(splitted[4]);
                        idf += double.Parse(splitted[5]);
                    }

                    // Upper-case only when every merged line had "T" in field 1.
                    string termPhrase = allCapitalized ? phrase.ToUpper() : phrase.ToLower();
                    string posting    = termPhrase + "\t" + minLine.ToString();

                    if (type == term.Type.word)
                    {
                        if (char.IsLetter(phrase[0]))
                        {
                            writers[Char.ToUpper(phrase[0]).ToString()].WriteLine(posting);
                            if (allCapitalized)
                            {
                                q.Enqueue(posting);
                                q_list.Release(1);
                            }
                        }
                        else
                        {
                            writers["other"].WriteLine(posting);
                        }
                    }
                    else
                    {
                        writers[type.ToString()].WriteLine(posting);
                    }

                    // Advance every file whose head line was just consumed.
                    for (int i = 0; i < fileCount; i++)
                    {
                        // Check the sentinel first so GetType never sees an exhausted file.
                        if (firstLines[i].Equals("\0"))
                        {
                            continue;
                        }
                        if (string.Compare(phrase, GetPhrase(firstLines[i]), true) != 0)
                        {
                            continue;
                        }
                        if (type != GetType(firstLines[i]))
                        {
                            continue;
                        }
                        firstLines[i] = ReadNextMergeLine(sr[i]);
                    }

                    writers["index"].WriteLine(termPhrase + "\t" + (int)type + "\t" + icf + "\t" + idf);
                }
            }
            finally
            {
                // Release every stream that was opened, even if the merge failed part-way.
                foreach (StreamReader reader in sr)
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
                foreach (KeyValuePair<string, StreamWriter> entry in writers)
                {
                    entry.Value.Close();
                }
            }

            // Only reached after a successful merge: the temporary files are no longer needed.
            for (int i = 0; i < fileCount; i++)
            {
                File.Delete(path + "\\index" + i + "sorted.txt");
            }
        }

        /// <summary>
        /// Reads the next non-blank line from <paramref name="reader"/>, returning the
        /// sentinel "\0" once the end of the file is reached.
        /// </summary>
        /// <param name="reader">an open reader over one sorted temporary file</param>
        /// <returns>the next non-empty line, or "\0" at end of file</returns>
        private static string ReadNextMergeLine(StreamReader reader)
        {
            string line;

            do
            {
                line = reader.ReadLine();
            } while (line == "");

            return(line ?? "\0");
        }
Exemplo n.º 5
0
 /// <summary>
 /// Creates an index term from a phrase and its type.
 /// </summary>
 /// <param name="phrase">the phrase stored in this instance</param>
 /// <param name="type">the type stored in this instance</param>
 public indexTerm(string phrase, term.Type type)
 {
     this.type   = type;
     this.phrase = phrase;
 }