Example #1
        static void Main(string[] args)
        {
            StringBuilder srcdirpath      = new StringBuilder(256);
            StringBuilder indexdirpath    = new StringBuilder(256);
            StringBuilder indexdirtmppath = new StringBuilder(256);

            Dictionary <string, int>         uniqueTermsDict  = new Dictionary <string, int>();
            Dictionary <string, int>         dfTermsDict      = new Dictionary <string, int>();
            Dictionary <string, double>      idfTermsDict     = new Dictionary <string, double>();
            List <Dictionary <string, int> > termDictPerInput = new List <Dictionary <string, int> >();
            List <int> fileSizes = new List <int>();

            StreamWriter  output;
            List <string> stopWords = new List <string>();
            DateTime      start     = DateTime.Now;

            // Process Input Parameters
            if (args.Length != 2)
            {
                System.Console.WriteLine("Invalid Parameters");
                return;
            }
            else
            {
                srcdirpath.Append(args[0]);
                indexdirpath.Append(args[1]);
            }

            // Open up the Input Directories
            #region DirectoryHandling
            DirectoryInfo srcdirinfo   = new DirectoryInfo(srcdirpath.ToString());
            DirectoryInfo indexdirinfo = new DirectoryInfo(indexdirpath.ToString());

            indexdirtmppath.Append(indexdirpath.ToString() + "/tmp");

            DirectoryInfo indexdirtmpinfo = new DirectoryInfo(indexdirtmppath.ToString());

            if (!srcdirinfo.Exists)
            {
                System.Console.WriteLine("Directory does not appear to exist");
                return;
            }

            if (!indexdirinfo.Exists)
            {
                System.Console.WriteLine("Creating /index directory");
                indexdirinfo.Create();
            }

            if (!indexdirtmpinfo.Exists)
            {
                System.Console.WriteLine("Creating /tmp directory");
                indexdirtmpinfo.Create();
            }
            #endregion

            // Glob in all stop words
            #region StopWords
            // We currently assume the stopwords.txt file is with the source files
            List <FileInfo> stopFileList = srcdirinfo.GetFiles("stoplist*").ToList();
            foreach (FileInfo file in stopFileList)
            {
                //its, lets, well, were are not duplicated
                HTMLParser stopW           = new HTMLParser(file.FullName);
                Dictionary <string, int> s = new Dictionary <string, int>(stopW.tokenize());

                foreach (string str in s.Keys)
                {
                    stopWords.Add(str);
                }
            }

            #endregion

            // Iterate over all the HTML Files from the Source Directory
            #region IterateOverInputFiles
            int             fileIndex   = 0;
            List <FileInfo> srcfilelist = srcdirinfo.GetFiles("*.html").ToList();
            foreach (FileInfo file in srcfilelist)
            {
                HTMLParser    fileTerms = new HTMLParser(file.FullName);
                StringBuilder newName   = new StringBuilder(file.Name);
                fileSizes.Add(0);

                newName.Replace("html", "txt");
                StringBuilder outputpath = new StringBuilder(indexdirtmpinfo.FullName + "\\" + newName.ToString());

                if (File.Exists(outputpath.ToString()))
                {
                    File.Delete(outputpath.ToString());
                }

                // Parse the File (the output is a set of terms)
                Dictionary <string, int> localTermDict = new Dictionary <string, int>(fileTerms.tokenize());
                Dictionary <string, int> cleanTermDict = new Dictionary <string, int>();
                // Use a separate dictionary because the primary cannot be modified while iterating over it

                // Write out results to tmp files
                output = new StreamWriter(outputpath.ToString(), false);

                IOrderedEnumerable <KeyValuePair <string, int> > termssortedhashlist =
                    (from entry in localTermDict orderby entry.Key ascending select entry);
                foreach (KeyValuePair <string, int> k in termssortedhashlist)
                {
                    fileSizes[fileIndex] += k.Value;

                    if (!stopWords.Contains(k.Key))
                    {
                        output.WriteLine(k.Key + " : " + k.Value);

                        cleanTermDict.Add(k.Key, k.Value);

                        // Calculate the DF for each Term
                        if (dfTermsDict.ContainsKey(k.Key))
                        {
                            // indicating we found it in another doc
                            dfTermsDict[k.Key] += 1;
                        }
                        else
                        {
                            dfTermsDict.Add(k.Key, 1);
                        }

                        if (uniqueTermsDict.ContainsKey(k.Key))
                        {
                            uniqueTermsDict[k.Key] += k.Value;
                        }
                        else
                        {
                            uniqueTermsDict.Add(k.Key, k.Value);
                        }
                    }
                }
                output.Close();

                fileIndex++;
                termDictPerInput.Add(cleanTermDict);
            } // end foreach input file
            #endregion

            // fileSizes has the total number of terms for each input file (including stop words, etc)
            // termDictPerInput has the total terms and their frequencies per document (in document order)
            // dfTermsDict has all the document frequencies for each term
            // uniqueTermsDict has all the terms and their total frequencies

            // Remove words which only appear once in the corpus
            #region CleanTermsLists

            IEnumerable <KeyValuePair <string, int> > reallyUniqueTerms =
                (from entry in uniqueTermsDict where entry.Value == 1 select entry);

            foreach (KeyValuePair <string, int> k in reallyUniqueTerms)
            {
                dfTermsDict.Remove(k.Key);

                foreach (Dictionary <string, int> d in termDictPerInput)
                {
                    d.Remove(k.Key);
                }
            }

            #endregion

            // Compute the IDF for each term in one pass
            #region ComputeIDF
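            // idf(term) = log10(totalDoc / df(term)); terms that appear in fewer documents get a higher idf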
            int totalDoc = termDictPerInput.Count;
            foreach (KeyValuePair <string, int> k in dfTermsDict)
            {
                idfTermsDict.Add(k.Key, Math.Log10((double)totalDoc / (double)k.Value));
            }
            #endregion

            // Set up for end of processing print out
            #region RemainingDataProcessing

            int dictIndex = 0;
            foreach (FileInfo file in srcfilelist)
            {
                StringBuilder newName = new StringBuilder(file.Name);
                newName.Replace("html", "txt");

                StringBuilder outputpath = new StringBuilder(indexdirinfo.FullName + "\\" + newName.ToString());

                if (File.Exists(outputpath.ToString()))
                {
                    File.Delete(outputpath.ToString());
                }

                // Write out results to Permanent Files
                output = new StreamWriter(outputpath.ToString(), false);

                IOrderedEnumerable <KeyValuePair <string, int> > sortedTermsbyAlpha =
                    (from entry in termDictPerInput[dictIndex] orderby entry.Key ascending select entry);

                foreach (KeyValuePair <string, int> k in sortedTermsbyAlpha)
                {
                    // Normalize Term Weights
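                    // tf-idf: weight(term, doc) = (tf(term, doc) / total terms in doc) * idf(term)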
                    double normWeight = (double)k.Value / (double)fileSizes[dictIndex];
                    double termWeight = normWeight * idfTermsDict[k.Key];

                    output.WriteLine(k.Key + " : " + termWeight.ToString());
                }
                output.Close();

                dictIndex++;
            }

            //totaltermlist.Sort(
            //    delegate(KeyValuePair<string, string> firstPair, KeyValuePair<string, string> nextPair)
            //    {
            //        return firstPair.Value.CompareTo(nextPair.Value);
            //    }
            //);
            //totaltermlist.Sort((firstPair, nextPair) =>
            //{
            //    return firstPair.Value.CompareTo(nextPair.Value);
            //}
            //);
            #endregion

            DateTime stop = DateTime.Now;
            Console.WriteLine("Total time: " + (stop - start).ToString());
        }
Example #2
        static void Main(string[] args)
        {
            StringBuilder srcdirpath     = new StringBuilder(256);
            StringBuilder indexdirpath   = new StringBuilder(256);
            StringBuilder queryTermspath = new StringBuilder(256);

            // globalTermsDict is the set of all terms and their global frequencies
            Dictionary <string, int> globalTermsDict = new Dictionary <string, int>();

            // idfTermsDict is used to calculate the idf for each term
            Dictionary <string, double> idfTermsDict = new Dictionary <string, double>();

            // invertedTermIndex will tell us which documents have which term, by term
            // NOTE: Awesomely enough, using this to find occurrences cut the time to process 503 files by more than half
            // Before, we went through each document asking if it contained the term; now we just know a priori
            // Also used to calculate document frequencies
            Dictionary <string, List <string> > invertedTermIndex = new Dictionary <string, List <string> >();

            // termsPerDoc is a list of term hashes for each document (term=>tf)
            Dictionary <string, Dictionary <string, int> > termsPerDoc = new Dictionary <string, Dictionary <string, int> >();

            // fileSizes holds the total count of all terms including stop words
            Dictionary <string, int> fileSizes = new Dictionary <string, int>();

            // stopWords is the set of stopwords
            List <string> stopWords = new List <string>();

            // queries is a set of Query objects, each represents a user query and the results
            List <Query> queries = new List <Query>();

            // dictionary: key is the term, the value is the df + index into postings file as a string
            Dictionary <string, string> memDictionary = new Dictionary <string, string>();

            // postingsCache: key is the term, the value is the postings records for the term
            Dictionary <string, RecordSet> postingsCache = new Dictionary <string, RecordSet>();

            int recordSize = 22;
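            // Each postings record is a 10-char docID + ':' + 9-char weight + the line
            // terminator (2 bytes on Windows, which this code assumes) = 22 bytes, so a
            // record can be located by a simple index * recordSize seek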

            StreamWriter output;
            StreamWriter postings;

            StreamReader dictFile;
            StreamReader queryFile;

            DateTime start = DateTime.Now;

            // Note: We now use a document ID to identify the term hash for each document
            //       and the size of each document.  This hash of hashes is really quick --
            //       effectively a pointer-to-pointer construction, so lookups are fast.
            //       Timing (reworked Part 2): simply changing the current codebase to use a
            //       hash lookup for each list brought processing of the 503 documents down
            //       from 60+ seconds to about 36.

            // Process Input Parameters
            #region ProcessInput
            if (args.Length != 3)
            {
                System.Console.WriteLine("Invalid Parameters: <input directory> <index directory> <query file>");
                return;
            }
            else
            {
                srcdirpath.Append(args[0]);
                indexdirpath.Append(args[1]);
                queryTermspath.Append(args[2]);
            }
            #endregion

            // Open up the Input Directories
            #region DirectoryHandling
            DirectoryInfo srcdirinfo   = new DirectoryInfo(srcdirpath.ToString());
            DirectoryInfo indexdirinfo = new DirectoryInfo(indexdirpath.ToString());

            if (!srcdirinfo.Exists)
            {
                System.Console.WriteLine("Source directory does not appear to exist");
                return;
            }

            if (!indexdirinfo.Exists)
            {
                System.Console.WriteLine("Creating /index directory");
                indexdirinfo.Create();
            }

            //if (!indexdirtmpinfo.Exists)
            //{
            //    System.Console.WriteLine("Creating /tmp directory");
            //    indexdirtmpinfo.Create();
            //}
            #endregion

            // Glob in all stop words
            #region StopWords
            // We currently assume the stopwords.txt file is with the source files
            List <FileInfo> stopFileList = srcdirinfo.GetFiles("stoplist*").ToList();
            foreach (FileInfo file in stopFileList)
            {
                //its, lets, well, were are not duplicated
                HTMLParser stopW           = new HTMLParser(file.FullName);
                Dictionary <string, int> s = new Dictionary <string, int>(stopW.tokenize());

                foreach (string str in s.Keys)
                {
                    stopWords.Add(str);
                }
            }

            #endregion

            // Iterate over all the HTML Files from the Source Directory
            #region IterateOverInputFiles
            List <FileInfo> srcfilelist = srcdirinfo.GetFiles("*.html").ToList();
            foreach (FileInfo file in srcfilelist)
            {
                HTMLParser    fileTerms = new HTMLParser(file.FullName);
                StringBuilder newName   = new StringBuilder(file.Name);

                newName.Replace("html", "txt");
                string docID = newName.Replace(".txt", "").ToString();
                fileSizes.Add(docID, 0);

                // Parse the File (the output is a set of terms)
                Dictionary <string, int> localTermDict = new Dictionary <string, int>(fileTerms.tokenize());
                Dictionary <string, int> cleanTermDict = new Dictionary <string, int>();
                // Use a separate dictionary because the primary cannot be modified while iterating over it

                IOrderedEnumerable <KeyValuePair <string, int> > termssortedhashlist =
                    (from entry in localTermDict orderby entry.Key ascending select entry);
                foreach (KeyValuePair <string, int> k in termssortedhashlist)
                {
                    fileSizes[docID] += k.Value;

                    if (!stopWords.Contains(k.Key))
                    {
                        cleanTermDict.Add(k.Key, k.Value);

                        // this will hold all terms and their global counts
                        // so we can track down singletons
                        if (globalTermsDict.ContainsKey(k.Key))
                        {
                            globalTermsDict[k.Key] += k.Value;
                        }
                        else
                        {
                            globalTermsDict.Add(k.Key, k.Value);
                        }

                        // this is our term document matrix in memory
                        if (invertedTermIndex.ContainsKey(k.Key))
                        {
                            invertedTermIndex[k.Key].Add(docID);
                        }
                        else
                        {
                            invertedTermIndex.Add(k.Key, new List <string>());
                            invertedTermIndex[k.Key].Add(docID);
                        }
                    }
                }

                termsPerDoc.Add(docID, cleanTermDict);
            } // end foreach input file
            #endregion

            // Remove words which only appear once in the corpus
            #region CleanTermsLists

            IEnumerable <KeyValuePair <string, int> > reallyUniqueTerms =
                (from entry in globalTermsDict where entry.Value == 1 select entry);

            foreach (KeyValuePair <string, int> k in reallyUniqueTerms)
            {
                // we know which docs have this unique term, can eliminate quickly!
                foreach (string dID in invertedTermIndex[k.Key])
                {
                    termsPerDoc[dID].Remove(k.Key);
                }

                invertedTermIndex.Remove(k.Key);
            }
            #endregion

            // Compute the IDF for each term in one pass
            #region ComputeIDF
            int totalDoc = termsPerDoc.Count;
            Console.WriteLine("Computed " + totalDoc.ToString() + " input files");
            foreach (KeyValuePair <string, List <string> > k in invertedTermIndex)
            {
                idfTermsDict.Add(k.Key, Math.Log10((double)totalDoc / (double)k.Value.Count));
            }
            #endregion

            // NOTE: All terms are truncated down to 35 characters -- ADD TO REPORT
            //Console.WriteLine("Longest Term: " + maxStringLen.ToString());
            // NOTE: Also worth mentioning that I don't start a new token if it switches from a-z to 0-9, although that
            // might provide more interesting results.

            // Print out Dictionary
            #region BuildDictionaryAndPostingsFile
            // Each fixed-length record contains:
            // word (35 bytes), df (5 bytes), location of the first record in the postings file (7 bytes)
            StringBuilder dictionarypath   = new StringBuilder(indexdirinfo.FullName + "\\dictionary.txt");
            StringBuilder postingsfilepath = new StringBuilder(indexdirinfo.FullName + "\\postings.txt");

            output   = new StreamWriter(dictionarypath.ToString(), false);
            postings = new StreamWriter(postingsfilepath.ToString(), false);

            int recordIndex = 0;

            // get sorted inverted term index
            IOrderedEnumerable <KeyValuePair <string, List <string> > > sortedTermsfromMatrix =
                (from entry in invertedTermIndex orderby entry.Key ascending select entry);

            foreach (KeyValuePair <string, List <string> > k in sortedTermsfromMatrix)
            {
                output.WriteLine(k.Key.PadRight(35) + ":" + k.Value.Count.ToString().PadLeft(5) + ":" + recordIndex.ToString().PadLeft(7));

                // instantly grab all occurrences of the key and add them to the postings file
                foreach (string dID in k.Value)
                {
                    // dID is the docID of a document containing this term
                    double normWeight = (double)termsPerDoc[dID][k.Key] / (double)fileSizes[dID];
                    double termWeight = normWeight * idfTermsDict[k.Key];

                    string tmp = String.Format("{0:.######}", termWeight);

                    // The "{0:.######}" format drops the leading zero for values below 1.0, so add it back;
                    // the 9-character field works as long as the max term weight is 9.999999
                    if (termWeight < 1.0)
                    {
                        tmp = tmp.Insert(0, "0");
                    }

                    postings.WriteLine(dID.PadRight(10) + ":" + tmp.PadRight(9, '0'));

                    recordIndex += 1;
                }
            }
            postings.Close();
            output.Close();
            #endregion

            // Free Memory Used to Build Index
            idfTermsDict.Clear();
            invertedTermIndex.Clear();
            globalTermsDict.Clear();
            termsPerDoc.Clear();
            fileSizes.Clear();

            //totaltermlist.Sort(
            //    delegate(KeyValuePair<string, string> firstPair, KeyValuePair<string, string> nextPair)
            //    {
            //        return firstPair.Value.CompareTo(nextPair.Value);
            //    }
            //);
            //totaltermlist.Sort((firstPair, nextPair) =>
            //{
            //    return firstPair.Value.CompareTo(nextPair.Value);
            //}
            //);

            // Read in Dictionary File
            // Dictionary File is made of Fixed-Length Records
            #region ParseDictionary
            dictFile = new StreamReader(dictionarypath.ToString());

            string dictline = "";

            // read in dictionary line by line (each line is a record of fixed length)
            while ((dictline = dictFile.ReadLine()) != null)
            {
                // 35 chars for term + ":" + 5 for df + ":" + 7 index into posting list
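                // Remove(0, 36) strips the 35-char term plus its ':' separator;
                // Remove(0, 6) strips the 5-char df plus its ':' separator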
                string dictterm = dictline.Substring(0, 35).Trim();
                dictline = dictline.Remove(0, 36);
                string dictdocfreq = dictline.Substring(0, 5).Trim();
                dictline = dictline.Remove(0, 6);
                string dictpostinx = dictline.Substring(0, 7).Trim();

                memDictionary.Add(dictterm, dictdocfreq + " " + dictpostinx);
            }

            dictFile.Close();
            #endregion

            // Read in Queries from File
            #region ParseQueries
            queryFile = new StreamReader(queryTermspath.ToString());

            string line = "";

            // each query needs to be on its own line
            while ((line = queryFile.ReadLine()) != null)
            {
                queries.Add(new Query(line));
            }

            queryFile.Close();
            #endregion

            // Process each query, one at a time
            #region ProcessQueries
            FileStream postingslist = new FileStream(postingsfilepath.ToString(), FileMode.Open);

            // process each query in order
            foreach (Query q in queries)
            {
                List <string> strs = q.getTokens();

                foreach (string s in strs)
                {
                    // is query term in dictionary?
                    if (memDictionary.ContainsKey(s))
                    {
                        if (postingsCache.ContainsKey(s))
                        {
                            // we already have the records in our cache
                            q.addResults(s, postingsCache[s]);
                        }
                        else
                        {
                            byte[]        record;
                            List <string> dfidx = new List <string>(memDictionary[s].Split(' '));

                            int docFreq       = Int32.Parse(dfidx[0].Trim());
                            int posting_index = Int32.Parse(dfidx[1].Trim());

                            // Read entries from postings file
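                            // All records for a term are written contiguously, so seek to
                            // posting_index * recordSize and read docFreq fixed-size records in one pass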
                            postingslist.Seek(posting_index * recordSize, SeekOrigin.Begin);

                            record = new byte[recordSize * docFreq];

                            postingslist.Read(record, 0, recordSize * docFreq);

                            string record_as_string = System.Text.ASCIIEncoding.ASCII.GetString(record);

                            //Console.WriteLine("Term: " + s + " DocFreq: " + docFreq.ToString() + " Offset: " + posting_index.ToString());
                            RecordSet rs = new RecordSet(record_as_string);

                            q.addResults(s, rs);
                            postingsCache.Add(s, rs);
                        }
                    }
                }
            }

            postingslist.Close();
            #endregion

            // Now that the queries are built and have their results, return this information to the user
            #region ProcessQueryResults
            foreach (Query q in queries)
            {
                Console.WriteLine("Query: " + q.getQuery());

                if (q.getResultsCount() > 0)
                {
                    IOrderedEnumerable <KeyValuePair <string, double> > results = q.getResults();

                    int cnt = (q.getResultsCount() > 10) ? 10 : q.getResultsCount();

                    for (int i = 0; i < cnt; i++)
                    {
                        KeyValuePair <string, double> kvp = results.ElementAt(i);

                        Console.WriteLine("Document Id: " + kvp.Key + " Score: " + kvp.Value.ToString());
                    }
                }
                else
                {
                    Console.WriteLine("Found no documents matching your query.");
                }
            }
            #endregion

            DateTime stop = DateTime.Now;
            Console.WriteLine("Total time: " + (stop - start).ToString());
        }
Example #3
        public Query(string query)
        {
            // Queries must not have repeated terms
            Regex r = new Regex("(?<term>.+?)=(?<weight>.+)");
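            // Each whitespace-separated token is expected to look like "term=weight";
            // the lazy ".+?" makes the first '=' the separator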

            double weightsum = 0.0;

            this.m_query   = query;
            this.m_terms   = new Dictionary <string, double>();
            this.m_results = new Dictionary <string, double>();

            // Break apart the query into terms with their weights
            List <string> termGroups = new List <string>(query.Split(' '));

            foreach (string t in termGroups)
            {
                Match m = r.Match(t);

                if (m.Success)
                {
                    double weight = Double.Parse(m.Groups["weight"].Value);

                    if (weight > 1.0 || weight < 0)
                    {
                        Console.WriteLine("Error: Invalid Weight for Token: " + t);
                    }
                    else
                    {
                        weightsum += weight;

                        Dictionary <string, int> breakup = new Dictionary <string, int>(HTMLParser.tokenize_string(m.Groups["term"].Value));

                        // Note: if the term tokenizes to a single token, breakup.Count is 1 and the term keeps the entered weight
                        foreach (KeyValuePair <string, int> kvp in breakup)
                        {
                            this.m_terms.Add(kvp.Key, weight / breakup.Count);
                        }
                    }
                }
                else
                {
                    Console.WriteLine("Error: Incorrect Syntax");
                }
            }

            // If the weights sum to less than 1.0, assign the leftover weight to the longest term
            if (weightsum < 1.0)
            {
                double extra   = 1.0 - weightsum;
                string longest = string.Empty;

                // Find the longest term
                foreach (string s in this.m_terms.Keys)
                {
                    if (longest.Length < s.Length)
                    {
                        longest = s;
                    }
                }

                //Console.WriteLine("Adding Extra " + extra.ToString() + " to term: " + longest);

                if (longest.Length > 0)
                {
                    this.m_terms[longest] += extra;
                }
            }
            else if (weightsum > 1.0)
            {
                Console.WriteLine("Error: Total weights provided is greater than 1.0");
            }
        }
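A minimal usage sketch (hypothetical query string; it assumes the tokenizer keeps each single word as one token, as in Example #5):

        public static void QueryUsageSketch()
        {
            // The given weights sum to 0.9, so the leftover 0.1 is added to the longest
            // term ("information"), leaving roughly information => 0.7, retrieval => 0.3
            Query q = new Query("information=0.6 retrieval=0.3");
        }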
Example #4
        static void Main(string[] args)
        {
            StringBuilder srcdirpath      = new StringBuilder(256);
            StringBuilder indexdirpath    = new StringBuilder(256);
            StringBuilder indexdirtmppath = new StringBuilder(256);

            // globalTermsDict is the set of all terms and their global frequencies
            Dictionary <string, int> globalTermsDict = new Dictionary <string, int>();

            // idfTermsDict is used to calculate the idf for each term
            Dictionary <string, double> idfTermsDict = new Dictionary <string, double>();

            // termDocMatrix will tell us which documents have which term, by term
            // NOTE: Awesomely enough, using this to find occurrences cut the time to process 503 files by more than half
            // Before, we went through each document asking if it contained the term; now we just know a priori
            // Also used to calculate document frequencies
            Dictionary <string, List <string> > termDocMatrix = new Dictionary <string, List <string> >();

            // termsPerDoc is a list of term hashes for each document (term=>tf)
            Dictionary <string, Dictionary <string, int> > termsPerDoc = new Dictionary <string, Dictionary <string, int> >();

            // fileSizes holds the total count of all terms including stop words
            Dictionary <string, int> fileSizes = new Dictionary <string, int>();

            // stopWords is the set of stopwords
            List <string> stopWords = new List <string>();

            StreamWriter output;
            StreamWriter postings;

            DateTime start = DateTime.Now;

            // Note: We now use a document ID to identify the term hash for each document
            //       and the size of each document.  This hash of hashes is really quick --
            //       effectively a pointer-to-pointer construction, so lookups are fast.
            //       Timing (reworked Part 2): simply changing the current codebase to use a
            //       hash lookup for each list brought processing of the 503 documents down
            //       from 60+ seconds to about 36.

            // Process Input Parameters
            #region ProcessInput
            if (args.Length != 2)
            {
                System.Console.WriteLine("Invalid Parameters");
                return;
            }
            else
            {
                srcdirpath.Append(args[0]);
                indexdirpath.Append(args[1]);
            }
            #endregion

            // Open up the Input Directories
            #region DirectoryHandling
            DirectoryInfo srcdirinfo   = new DirectoryInfo(srcdirpath.ToString());
            DirectoryInfo indexdirinfo = new DirectoryInfo(indexdirpath.ToString());

            indexdirtmppath.Append(indexdirpath.ToString() + "/tmp");

            DirectoryInfo indexdirtmpinfo = new DirectoryInfo(indexdirtmppath.ToString());

            if (!srcdirinfo.Exists)
            {
                System.Console.WriteLine("Directory does not appear to exist");
                return;
            }

            if (!indexdirinfo.Exists)
            {
                System.Console.WriteLine("Creating /index directory");
                indexdirinfo.Create();
            }

            //if (!indexdirtmpinfo.Exists)
            //{
            //    System.Console.WriteLine("Creating /tmp directory");
            //    indexdirtmpinfo.Create();
            //}
            #endregion

            // Glob in all stop words
            #region StopWords
            // We currently assume the stopwords.txt file is with the source files
            List <FileInfo> stopFileList = srcdirinfo.GetFiles("stoplist*").ToList();
            foreach (FileInfo file in stopFileList)
            {
                //its, lets, well, were are not duplicated
                HTMLParser stopW           = new HTMLParser(file.FullName);
                Dictionary <string, int> s = new Dictionary <string, int>(stopW.tokenize());

                foreach (string str in s.Keys)
                {
                    stopWords.Add(str);
                }
            }

            #endregion

            // Iterate over all the HTML Files from the Source Directory
            #region IterateOverInputFiles
            List <FileInfo> srcfilelist = srcdirinfo.GetFiles("*.html").ToList();
            foreach (FileInfo file in srcfilelist)
            {
                HTMLParser    fileTerms = new HTMLParser(file.FullName);
                StringBuilder newName   = new StringBuilder(file.Name);

                newName.Replace("html", "txt");
                string docID = newName.Replace(".txt", "").ToString();
                fileSizes.Add(docID, 0);

                // Parse the File (the output is a set of terms)
                Dictionary <string, int> localTermDict = new Dictionary <string, int>(fileTerms.tokenize());
                Dictionary <string, int> cleanTermDict = new Dictionary <string, int>();
                // Use a separate dictionary because the primary cannot be modified while iterating over it

                IOrderedEnumerable <KeyValuePair <string, int> > termssortedhashlist =
                    (from entry in localTermDict orderby entry.Key ascending select entry);
                foreach (KeyValuePair <string, int> k in termssortedhashlist)
                {
                    fileSizes[docID] += k.Value;

                    if (!stopWords.Contains(k.Key))
                    {
                        cleanTermDict.Add(k.Key, k.Value);

                        // this will hold all terms and their global counts
                        // so we can track down singletons
                        if (globalTermsDict.ContainsKey(k.Key))
                        {
                            globalTermsDict[k.Key] += k.Value;
                        }
                        else
                        {
                            globalTermsDict.Add(k.Key, k.Value);
                        }

                        // this is our term document matrix in memory
                        if (termDocMatrix.ContainsKey(k.Key))
                        {
                            termDocMatrix[k.Key].Add(docID);
                        }
                        else
                        {
                            termDocMatrix.Add(k.Key, new List <string>());
                            termDocMatrix[k.Key].Add(docID);
                        }
                    }
                }

                termsPerDoc.Add(docID, cleanTermDict);
            } // end foreach input file
            #endregion

            // Remove words which only appear once in the corpus
            #region CleanTermsLists

            IEnumerable <KeyValuePair <string, int> > reallyUniqueTerms =
                (from entry in globalTermsDict where entry.Value == 1 select entry);

            foreach (KeyValuePair <string, int> k in reallyUniqueTerms)
            {
                // we know which docs have this unique term, can eliminate quickly!
                foreach (string dID in termDocMatrix[k.Key])
                {
                    termsPerDoc[dID].Remove(k.Key);
                }

                termDocMatrix.Remove(k.Key);
            }
            #endregion

            //StringBuilder debugpath = new StringBuilder(indexdirinfo.FullName + "\\");
            //foreach (KeyValuePair<string, Dictionary<string, int>> k in termsPerDoc)
            //{
            //    output = new StreamWriter(debugpath.ToString() + k.Key + ".txt", false);

            //    foreach (KeyValuePair<string, int> d in k.Value)
            //    {
            //        output.WriteLine(d.Key);
            //    }

            //    output.Close();
            //}

            // Compute the IDF for each term in one pass
            #region ComputeIDF
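            // df(term) is simply the length of the term's document list in termDocMatrix,
            // so idf(term) = log10(totalDoc / docList.Count)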
            int totalDoc = termsPerDoc.Count;
            Console.WriteLine("Computed " + totalDoc.ToString() + " input files");
            foreach (KeyValuePair <string, List <string> > k in termDocMatrix)
            {
                idfTermsDict.Add(k.Key, Math.Log10((double)totalDoc / (double)k.Value.Count));
            }
            #endregion

            // Temporary files could be written out here; writing out the term-document
            // matrix in particular would be beneficial

            // Print out Dictionary
            #region BuildDictionaryAndPostingsFile
            // Each fixed-length record contains:
            // word (35 bytes), df (5 bytes), location of the first record in the postings file (7 bytes)
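            // A sample record (values hypothetical): "retrieval" padded to 35 characters,
            // then ":   17:    845" -- 35 + 1 + 5 + 1 + 7 = 49 characters per line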
            StringBuilder dictionarypath   = new StringBuilder(indexdirinfo.FullName + "\\dictionary.txt");
            StringBuilder postingsfilepath = new StringBuilder(indexdirinfo.FullName + "\\postings.txt");

            output   = new StreamWriter(dictionarypath.ToString(), false);
            postings = new StreamWriter(postingsfilepath.ToString(), false);

            int recordIndex = 0;

            IOrderedEnumerable <KeyValuePair <string, List <string> > > sortedTermsfromMatrix =
                (from entry in termDocMatrix orderby entry.Key ascending select entry);

            foreach (KeyValuePair <string, List <string> > k in sortedTermsfromMatrix)
            {
                output.WriteLine(k.Key.PadRight(35) + ":" + k.Value.Count.ToString().PadLeft(5) + ":" + recordIndex.ToString().PadLeft(7));

                // instantly grab all occurrences of the key and add them to the postings file
                foreach (string dID in k.Value)
                {
                    // dID is the docID of a document containing this term
                    double normWeight = (double)termsPerDoc[dID][k.Key] / (double)fileSizes[dID];
                    double termWeight = normWeight * idfTermsDict[k.Key];

                    string tmp = String.Format("{0:.######}", termWeight);

                    // The "{0:.######}" format drops the leading zero for values below 1.0, so add it back;
                    // the 9-character field works as long as the max term weight is 9.999999
                    if (termWeight < 1.0)
                    {
                        tmp = tmp.Insert(0, "0");
                    }

                    postings.WriteLine(dID.PadRight(10) + ":" + tmp.PadRight(9, '0'));

                    recordIndex += 1;
                }
            }
            postings.Close();
            output.Close();
            #endregion

            //totaltermlist.Sort(
            //    delegate(KeyValuePair<string, string> firstPair, KeyValuePair<string, string> nextPair)
            //    {
            //        return firstPair.Value.CompareTo(nextPair.Value);
            //    }
            //);
            //totaltermlist.Sort((firstPair, nextPair) =>
            //{
            //    return firstPair.Value.CompareTo(nextPair.Value);
            //}
            //);

            DateTime stop = DateTime.Now;
            Console.WriteLine("Total time: " + (stop - start).ToString());
        }
Example #5
        public static Dictionary <string, int> tokenize_string(string query)
        {
            // Effectively the same as the file-based tokenizer, except this can be used as a library call on a string
            int         bytesRead = 0;
            char        b;
            ParserState state = ParserState.InsideToken;
            Dictionary <string, int> tokens   = new Dictionary <string, int>();
            StringBuilder            newToken = new StringBuilder();

            // make lowercase
            query = query.ToLower();

            // tokenize the string
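            // Simple state machine: InsideToken collects runs of [a-z0-9] into newToken;
            // '<' .. '>' switches into/out of InsideTag so tag contents are skipped;
            // '&' .. ';' in the middle of a token switches into/out of InsideSpecial so
            // an HTML entity's characters are not appended to the token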
            while (bytesRead < query.Length)
            {
                b = (char)query[bytesRead];

                // process byte
                switch (b)
                {
                case '<':
                    if (state == ParserState.InsideToken)
                    {
                        // end current token
                        if (newToken.Length > 1)
                        {
                            HTMLParser.addToken(tokens, newToken.ToString());
                        }

                        newToken.Remove(0, newToken.Length);

                        state = ParserState.InsideTag;
                    }
                    else if (state != ParserState.InsideTag)
                    {
                        // we just started a tag
                        state = ParserState.InsideTag;
                    }
                    break;

                case '>':
                    if (state == ParserState.InsideTag)
                    {
                        state = ParserState.InsideToken;
                    }
                    break;

                case '&':
                    // We only go into the special state if we are inside a token, i.e.
                    // in the middle or at the end of a token
                    if (state == ParserState.InsideToken && newToken.Length > 0)
                    {
                        state = ParserState.InsideSpecial;
                    }
                    break;

                case ';':
                    if (state == ParserState.InsideSpecial)
                    {
                        // Go back to the regular token state; we must have been
                        // in it before entering the special state
                        state = ParserState.InsideToken;
                    }
                    break;

                default:
                    if (state == ParserState.InsideToken)
                    {
                        if ((b >= 'a' && b <= 'z') || (b >= '0' && b <= '9'))
                        {
                            newToken.Append(b);
                        }
                        else if ((b > 'z' || b < 'a') && b != '.' && b != '\'')
                        {
                            if (newToken.Length > 1)
                            {
                                HTMLParser.addToken(tokens, newToken.ToString());
                            }

                            newToken.Remove(0, newToken.Length);
                        }
                    }
                    else if (state == ParserState.InsideSpecial)
                    {
                        if (b == '\n' || b == ' ' || b == '\t' || b == '\r')
                        {
                            if (newToken.Length > 1)
                            {
                                HTMLParser.addToken(tokens, newToken.ToString());
                            }

                            newToken.Remove(0, newToken.Length);
                        }
                    }
                    break;
                }

                bytesRead++;
            }

            if (newToken.Length > 1)
            {
                HTMLParser.addToken(tokens, newToken.ToString());
            }

            return(new Dictionary <string, int>(tokens));
        }
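A minimal usage sketch (hypothetical input; it assumes HTMLParser.addToken increments the count when a token repeats, which is not shown in this listing):

        public static void TokenizeStringSketch()
        {
            // Tags are skipped, punctuation ends a token, and one-character tokens are
            // dropped, so this should yield roughly { quick:1, brown:1, fox:2, the:1 }
            Dictionary<string, int> tokens = HTMLParser.tokenize_string("<b>Quick</b> brown fox, the fox!");

            foreach (KeyValuePair<string, int> kvp in tokens)
            {
                Console.WriteLine(kvp.Key + " : " + kvp.Value);
            }
        }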