/// <summary>
/// Converts one line of a posting file into a Term object.
/// </summary>
/// <remarks>
/// Layout consumed by the parser: the term text, then '@', then zero or more
/// document entries separated by '|'. Each document entry is the document name,
/// then ';', then a header flag ("H" means the term appears in the header, any
/// other text means it does not), then '&amp;', then the term frequency as an
/// integer, then ':', then a comma-separated list of locations.
/// Empty document entries and empty locations are skipped. If the same document
/// name appears more than once, the last entry wins.
/// </remarks>
/// <param name="line">One line of posting data in the layout described above.</param>
/// <returns>
/// A Term whose termString, d_locations, d_docTf and d_docHeader are filled from the line.
/// Each d_locations value holds the non-empty locations, each followed by a comma.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// A required delimiter ('@', ';', '&amp;' or ':') is missing.
/// </exception>
/// <exception cref="FormatException">The term-frequency field is not a valid integer.</exception>
/// <exception cref="OverflowException">The term-frequency field does not fit in an Int32.</exception>
public Term convertPostingStringToTerm(string line)
{
    Term result = new Term();
    Dictionary<string, StringBuilder> dLocations = new Dictionary<string, StringBuilder>();
    Dictionary<string, int> dDocTf = new Dictionary<string, int>();
    Dictionary<string, bool> dDocHeader = new Dictionary<string, bool>();

    // Everything before the first '@' is the term text; the remainder is the document list.
    int termEndIndex = line.IndexOf('@');
    string termString = line.Substring(0, termEndIndex);
    string[] docsDivisionArray = line.Substring(termEndIndex + 1).Split('|');

    foreach (string docString in docsDivisionArray)
    {
        if (docString.Length < 1)
        {
            continue;
        }

        // Fields are located by absolute index inside docString, so no intermediate
        // "remainder" strings are allocated. A missing delimiter yields -1 and the
        // following Substring call throws ArgumentOutOfRangeException.
        int nameEnd = docString.IndexOf(';');
        string docName = docString.Substring(0, nameEnd);

        int headerStart = nameEnd + 1;
        int headerEnd = docString.IndexOf('&', headerStart);
        bool isHeader = docString.Substring(headerStart, headerEnd - headerStart) == "H";

        int tfStart = headerEnd + 1;
        int tfEnd = docString.IndexOf(':', tfStart);
        // The posting file is machine-written, so parse with the invariant culture
        // rather than whatever culture the current thread happens to use.
        int docTf = Int32.Parse(
            docString.Substring(tfStart, tfEnd - tfStart),
            System.Globalization.CultureInfo.InvariantCulture);

        // Re-emit the locations as "loc,loc,...," while dropping empty entries.
        StringBuilder sb = new StringBuilder();
        foreach (string locString in docString.Substring(tfEnd + 1).Split(','))
        {
            if (locString.Length < 1)
            {
                continue;
            }
            sb.Append(locString).Append(',');
        }

        dLocations[docName] = sb;
        dDocTf[docName] = docTf;
        dDocHeader[docName] = isHeader;
    }

    result.termString = termString;
    result.d_locations = dLocations;
    result.d_docTf = dDocTf;
    result.d_docHeader = dDocHeader;
    return result;
}
/// <summary>
/// Records one occurrence of a term in a document: registers the term in
/// <paramref name="d_terms"/> if it is new, adds the position, marks the document
/// as a header occurrence when the type starts with 'H', and updates the
/// document's per-term count and most-frequent-term fields.
/// </summary>
/// <param name="d_terms">Dictionary to add to; also used as the lock object while it is updated.</param>
/// <param name="term">Term string to add.</param>
/// <param name="docName">Name of the document the term was found in; must be a key of d_docs.</param>
/// <param name="index">Position of the term, passed through to Term.addPosition.</param>
/// <param name="numOfTerms">Counter incremented by one when the term was not yet in <paramref name="d_terms"/>.</param>
/// <param name="type">
/// Term type. A type whose first character is 'H' marks a header occurrence;
/// a null or empty type is treated as a non-header occurrence.
/// </param>
private static void addTermToDic(SortedDictionary<string, Term> d_terms, string term, string docName, int index, ref int numOfTerms, string type)
{
    // Decide the header flag before touching any shared state, so an empty type
    // cannot fail half-way through after the term has already been inserted.
    bool isHeader = !string.IsNullOrEmpty(type) && type[0] == 'H';

    lock (d_terms)
    {
        // Single lookup instead of ContainsKey followed by repeated indexer reads.
        Term entry;
        if (!d_terms.TryGetValue(term, out entry))
        {
            entry = new Term(type, term);
            d_terms[term] = entry;
            numOfTerms++;
        }

        entry.addPosition(docName, index);
        if (isHeader)
        {
            entry.d_docHeader[docName] = true;
        }
    }

    // Update the per-document count of this term. This part runs outside the lock,
    // exactly as before.
    Doc doc = d_docs[docName];
    int tfDoc;
    doc.d_TermsCount.TryGetValue(term, out tfDoc); // leaves tfDoc at 0 when the term is absent
    tfDoc++;
    doc.d_TermsCount[term] = tfDoc;

    // Track the most frequent term of the document.
    if (doc.maxtfCount < tfDoc)
    {
        doc.maxtfString = term;
        doc.maxtfCount = tfDoc;
    }
}