Esempio n. 1
0
        /// <summary>
        /// Converts a single line of a posting file into a <c>Term</c> object.
        /// </summary>
        /// <remarks>
        /// Layout implied by the delimiters parsed below:
        /// <c>term@doc;flag&amp;tf:loc,loc,...|doc;flag&amp;tf:loc,loc,...</c>
        /// where <c>flag</c> equal to "H" marks the document entry as a header occurrence
        /// (any other value is stored as <c>false</c>) and <c>tf</c> is an integer.
        /// Empty document entries and empty locations are skipped. No validation is performed:
        /// when an expected delimiter is missing, <c>IndexOf</c> returns -1 and the following
        /// <c>Substring</c> call throws.
        /// </remarks>
        /// <param name="line">One posting line in the layout described above.</param>
        /// <returns>
        /// A Term whose <c>termString</c>, <c>d_locations</c>, <c>d_docTf</c> and <c>d_docHeader</c>
        /// are populated from the line. Each <c>d_locations</c> value holds the document's
        /// locations, every one followed by a comma (so the text ends with a trailing comma).
        /// </returns>
        public Term convertPostingStringToTerm(string line)
        {
            Term parsed = new Term();
            Dictionary<string, StringBuilder> locationsByDoc = new Dictionary<string, StringBuilder>();
            Dictionary<string, int> tfByDoc = new Dictionary<string, int>();
            Dictionary<string, bool> headerByDoc = new Dictionary<string, bool>();

            // Everything before the first '@' is the term text; the remainder is per-document data.
            int termEnd = line.IndexOf('@');
            string termText = line.Substring(0, termEnd);
            string postings = line.Substring(termEnd + 1);

            // Document entries are '|'-separated; empty entries (e.g. after a trailing '|') are dropped.
            foreach (string entry in postings.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries))
            {
                // Document name runs up to the first ';'.
                int nameEnd = entry.IndexOf(';');
                string docName = entry.Substring(0, nameEnd);
                string remainder = entry.Substring(nameEnd + 1);

                // Header flag runs up to the first '&'; only the exact value "H" counts as header.
                int flagEnd = remainder.IndexOf('&');
                bool isHeader = remainder.Substring(0, flagEnd) == "H";
                remainder = remainder.Substring(flagEnd + 1);

                // Term frequency in this document runs up to the first ':'.
                int tfEnd = remainder.IndexOf(':');
                int docTf = int.Parse(remainder.Substring(0, tfEnd));
                remainder = remainder.Substring(tfEnd + 1);

                // What is left is the comma-separated location list; re-emit it with a trailing comma
                // after every non-empty location.
                StringBuilder locations = new StringBuilder();
                foreach (string location in remainder.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    locations.Append(location).Append(',');
                }

                // A repeated document name overwrites the earlier entry.
                locationsByDoc[docName] = locations;
                tfByDoc[docName] = docTf;
                headerByDoc[docName] = isHeader;
            }

            parsed.termString = termText;
            parsed.d_locations = locationsByDoc;
            parsed.d_docTf = tfByDoc;
            parsed.d_docHeader = headerByDoc;

            return parsed;
        }
Esempio n. 2
0
        /// <summary>
        /// Records one occurrence of a term: registers the term in <paramref name="d_terms"/> if it is new,
        /// adds the occurrence position to it, and updates the per-document term count and
        /// most-frequent-term data of the document found in <c>d_docs</c>.
        /// </summary>
        /// <param name="d_terms">Dictionary of terms to update; also used as the lock object while it is modified.</param>
        /// <param name="term">Term string to add.</param>
        /// <param name="docName">Name of the document the occurrence belongs to; must be a key of <c>d_docs</c>.</param>
        /// <param name="index">Position of the occurrence, passed on to <c>Term.addPosition</c>.</param>
        /// <param name="numOfTerms">Counter that is incremented each time a term not yet present in <paramref name="d_terms"/> is inserted.</param>
        /// <param name="type">
        /// Term type. When its first character is 'H' the document is flagged in the term's <c>d_docHeader</c>.
        /// A null or empty type is treated as a non-header occurrence.
        /// </param>
        private static void addTermToDic(SortedDictionary<string, Term> d_terms, string term, string docName, int index, ref int numOfTerms, string type)
        {
            lock (d_terms)
            {
                // Single lookup; the reference is reused instead of re-indexing the sorted dictionary.
                Term entry;
                if (!d_terms.TryGetValue(term, out entry))
                {
                    entry = new Term(type, term);
                    d_terms[term] = entry;
                    numOfTerms++;
                }

                // The position is recorded for header and non-header occurrences alike.
                entry.addPosition(docName, index);

                // Guarding against an empty type avoids an IndexOutOfRangeException that would
                // otherwise leave a freshly inserted term without any position.
                if (!string.IsNullOrEmpty(type) && type[0] == 'H')
                {
                    entry.d_docHeader[docName] = true;
                }
            }

            // NOTE(review): the document statistics below are updated outside the lock, exactly as
            // before; presumably a given document is processed by a single thread - confirm.
            Doc doc = d_docs[docName];

            // TryGetValue leaves tfDoc at 0 when the term has not been counted for this document yet.
            int tfDoc;
            doc.d_TermsCount.TryGetValue(term, out tfDoc);
            tfDoc++;
            doc.d_TermsCount[term] = tfDoc;

            // Keep track of the most frequent term in the document.
            if (doc.maxtfCount < tfDoc)
            {
                doc.maxtfString = term;
                doc.maxtfCount = tfDoc;
            }
        }