/// <summary>
/// Builds the stemmed vocabulary of <paramref name="doc"/> and, as a by-product, the list of
/// stems found in each sentence (each sentence is treated as one "document").
/// </summary>
/// <param name="doc">Document whose <c>sentences</c> and their <c>words</c> are scanned.</param>
/// <param name="stemmedDocs">Receives one list of stems per sentence, in sentence order.</param>
/// <param name="config">Supplies the <c>Stemmer</c> used to reduce each word to its stem.</param>
/// <param name="vocabularyThreshold">Minimum number of occurrences (across all sentences) a stem needs to enter the vocabulary.</param>
/// <returns>The stems whose total occurrence count is at least <paramref name="vocabularyThreshold"/>.</returns>
private static List<string> GetVocabulary(Document doc, out List<List<string>> stemmedDocs, Configuration config, int vocabularyThreshold)
{
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    stemmedDocs = new List<List<string>>();

    foreach (Sentence sentence in doc.sentences)
    {
        List<string> stemmedDoc = new List<string>();

        foreach (wordDetails part in sentence.words)
        {
            // Strip non-alphanumeric characters.
            string stripped = Regex.Replace(part.word, "[^a-zA-Z0-9]", "");

            if (StopWords.stopWordsList.Contains(stripped.ToLower()))
            {
                continue;
            }

            string stem;
            try
            {
                stem = config.Stemmer.Stem(stripped);
            }
            catch (Exception ex)
            {
                // Best effort: a word that cannot be stemmed is reported and skipped,
                // it must not abort processing of the whole document.
                Console.WriteLine("There is some error in Stemming ('{0}'): {1}", stripped, ex.Message);
                continue;
            }

            if (string.IsNullOrEmpty(stem))
            {
                continue;
            }

            // Build the word count list. The first occurrence counts as 1 so that
            // vocabularyThreshold is compared against the real number of occurrences.
            int seen;
            wordCountList.TryGetValue(stem, out seen);
            wordCountList[stem] = seen + 1;

            stemmedDoc.Add(stem);
        }

        stemmedDocs.Add(stemmedDoc);
    }

    // Keep only the stems that occur often enough.
    return wordCountList
        .Where(w => w.Value >= vocabularyThreshold)
        .Select(w => w.Key)
        .ToList();
}
/// <summary>
/// Loads the work item at <paramref name="InputFilePath"/>, extracts its search string,
/// preprocesses it and prepares it for training.
/// </summary>
/// <param name="InputFilePath">Path handed to the <c>Document</c> constructor.</param>
/// <param name="outFilePath">Output file; it is created (or truncated) but nothing is written to it by this method.</param>
public void ProcessSearchStringWorkItem(string InputFilePath, string outFilePath)
{
    // FileMode.Create creates or truncates the output file. The using block guarantees
    // the handle is released even when preprocessing throws.
    using (FileStream fs1 = new FileStream(outFilePath, FileMode.Create))
    {
        Document doc = new Document(InputFilePath);

        // Preprocess
        Preprocessor.extractSearchString(doc);
        Preprocessor.process(doc, _config);
        Preprocessor.PrepareTraining(doc, _config);
    }
}
/// <summary>
/// Loads the work item at <paramref name="InputFilePath"/>, preprocesses and summarizes it,
/// then prints the summary to the console and writes the same text to <paramref name="outFilePath"/>.
/// </summary>
/// <param name="InputFilePath">Path handed to the <c>Document</c> constructor.</param>
/// <param name="outFilePath">Output file; created or truncated, then filled with the summary text.</param>
public void ProcessWorkItem(string InputFilePath, string outFilePath)
{
    // The using blocks flush and close the writer (and its underlying stream) on every
    // exit path, including when preprocessing or summarizing throws.
    using (FileStream fs1 = new FileStream(outFilePath, FileMode.Create))
    using (StreamWriter sw1 = new StreamWriter(fs1))
    {
        Document doc = new Document(InputFilePath);

        // Preprocess
        Preprocessor.process(doc, _config);

        // Summarize
        Console.WriteLine("Summerizing the job Descriptions.....\n");
        _config.Summarizer.Summarize(doc, _config);

        // The same text goes to the console and to the output file, so build it once.
        string allResults = "\nDescription:\n" + doc.description
            + "\n\nMust Have:\n" + doc.MustHave
            + "\n\nGood To Have:\n" + doc.GoodTohave + "\n\n";

        Console.WriteLine(allResults);
        sw1.WriteLine(allResults);
    }
}
/// <summary>
/// Scores every sentence of <paramref name="doc"/> for how well it describes the job,
/// stores the score in <c>Descriptionscore</c>, and sets <c>doc.description</c> to the
/// five best-scoring sentences concatenated in ascending sentence-number order.
/// </summary>
/// <param name="doc">Document whose sentences are scored and whose <c>description</c> is overwritten.</param>
void DescriptionSummary(Document doc)
{
    Dictionary<int, double> DescriptionSentScores = new Dictionary<int, double>();
    Dictionary<int, string> sentences = new Dictionary<int, string>();

    foreach (Sentence S in doc.sentences)
    {
        // Sentences under a heading that mentions "description" get a fixed bonus.
        double isDescription = 0;
        if (S.paraHeading.ToLower().Contains("description"))
        {
            isDescription = 5;
        }

        // Position bonus: earlier sentences score higher. Computed in floating point,
        // because integer division 1 / NoSent is 0 for every sentence after the first
        // and throws for sentence number 0.
        double positionScore = S.NoSent > 0 ? 1.0 / S.NoSent : 0;

        // This must be done more effectively using a classification ML algorithm.
        // That needs manually annotated data, so it is left for the future.
        S.Descriptionscore = S.TFIDFScore
            + S.Lenght * 0.001
            + S.JJCount * 0.003
            + positionScore * 0.5
            + S.upperCaseLettersCount * 0.05
            + isDescription;

        DescriptionSentScores.Add(S.NoSent, S.Descriptionscore);
        sentences.Add(S.NoSent, S.sent);
    }

    // Pick the five highest-scoring sentences, then restore sentence-number order
    // so the summary reads in the same sequence as the source text.
    string[] extracted = DescriptionSentScores
        .OrderByDescending(entry => entry.Value)
        .Take(5)
        .OrderBy(entry => entry.Key)
        .Select(entry => sentences[entry.Key])
        .ToArray();

    doc.description = string.Join("", extracted);
}
/// <summary>
/// Computes TF-IDF scores for the sentences of <paramref name="doc"/>.
/// The IDF table <c>_IDF</c> is filled only while it is still empty; once populated
/// it is reused as-is for every later call.
/// </summary>
/// <param name="doc">Document whose sentences are stemmed and scored.</param>
/// <param name="config">Configuration forwarded to the vocabulary and vector steps.</param>
/// <param name="vocabularyThreshold">Occurrence threshold forwarded to <c>GetVocabulary</c>.</param>
public static void Transform(Document doc, Configuration config, int vocabularyThreshold = 3)
{
    // One pass yields both the vocabulary and the per-sentence stem lists.
    List<List<string>> stemsPerSentence;
    List<string> terms = GetVocabulary(doc, out stemsPerSentence, config, vocabularyThreshold);

    if (_IDF.Count == 0)
    {
        double sentenceCount = stemsPerSentence.Count;

        // idf(term) = ln( N / (1 + number of sentences containing the term) )
        foreach (string term in terms)
        {
            int sentencesWithTerm = stemsPerSentence.Count(stems => stems.Contains(term));
            _IDF[term] = Math.Log(sentenceCount / (1.0 + sentencesWithTerm));
        }
    }

    // Turn every sentence into TF-IDF values using the (possibly pre-existing) IDF table.
    TransformToTFIDFVectors(doc, config, _IDF);
}
/// <summary>
/// Scores every sentence of <paramref name="doc"/> as a "good to have" qualification,
/// stores the score in <c>Qualificationscore</c>, and sets <c>doc.GoodTohave</c> to the
/// three best-scoring sentences, best first, each followed by a newline.
/// </summary>
/// <param name="doc">Document whose sentences are scored and whose <c>GoodTohave</c> is overwritten.</param>
void GoodToHaveSummary(Document doc)
{
    // NOTE(review): this pattern is matched case-sensitively, so "Must" or "Required"
    // at the start of a sentence does not count as a match - confirm that is intended.
    const string mustHavePattern = "(must|required|should|minimum|strong)";

    Dictionary<int, double> scoreByNumber = new Dictionary<int, double>();
    Dictionary<int, string> textByNumber = new Dictionary<int, string>();

    foreach (Sentence S in doc.sentences)
    {
        // Bonus for any heading that mentions "qualification" or does not mention "description".
        string heading = S.paraHeading.ToLower();
        double headingBonus = 0;
        if (heading.Contains("qualification") || !heading.Contains("description"))
        {
            headingBonus = 10;
        }

        // Sentences WITHOUT must-have wording are favoured here.
        double wordingBonus = Regex.IsMatch(S.sent, mustHavePattern) ? 0 : 5;

        S.Qualificationscore = S.TFIDFScore + S.Lenght * 0.001 + S.JJCount * 0.003 + S.nounCount * 0.05 + S.upperCaseLettersCount * 0.5 + headingBonus + wordingBonus;

        scoreByNumber.Add(S.NoSent, S.Qualificationscore);
        textByNumber.Add(S.NoSent, S.sent);
    }

    // Highest score first, at most three sentences, one per line.
    string[] topLines = scoreByNumber
        .OrderByDescending(entry => entry.Value)
        .Take(3)
        .Select(entry => textByNumber[entry.Key] + "\n")
        .ToArray();

    doc.GoodTohave = string.Join("", topLines);
}
/// <summary>
/// Adds to each sentence's <c>TFIDFScore</c> the sum of its L2-normalized TF-IDF vector.
/// </summary>
/// <remarks>
/// The vocabulary keys are stems (stripped of non-alphanumerics, stop words removed, stemmed),
/// so term frequency is counted over the sentence's words after the same treatment.
/// Comparing the raw word text against a stem would miss every word whose stem differs
/// from its surface form.
/// </remarks>
/// <param name="doc">Document whose sentences are scored; each <c>TFIDFScore</c> is incremented.</param>
/// <param name="config">Supplies the <c>Stemmer</c> used to reduce each word to its stem.</param>
/// <param name="vocabularyIDF">IDF value per vocabulary stem; its enumeration order defines the vector layout.</param>
private static void TransformToTFIDFVectors(Document doc, Configuration config, Dictionary<string, double> vocabularyIDF)
{
    foreach (Sentence S in doc.sentences)
    {
        // Count every stem of the sentence once, so each vocabulary term is a single
        // dictionary lookup instead of a scan over all words.
        Dictionary<string, int> stemCounts = new Dictionary<string, int>();
        foreach (var part in S.words)
        {
            string stripped = Regex.Replace(part.word, "[^a-zA-Z0-9]", "");
            if (StopWords.stopWordsList.Contains(stripped.ToLower()))
            {
                continue;
            }

            string stem;
            try
            {
                stem = config.Stemmer.Stem(stripped);
            }
            catch (Exception ex)
            {
                // Best effort: report the word that could not be stemmed and skip it.
                Console.WriteLine("There is some error in Stemming ('{0}'): {1}", stripped, ex.Message);
                continue;
            }

            if (string.IsNullOrEmpty(stem))
            {
                continue;
            }

            int seen;
            stemCounts.TryGetValue(stem, out seen);
            stemCounts[stem] = seen + 1;
        }

        // tf-idf per vocabulary term, in the dictionary's enumeration order.
        double[] tfidfs = new double[vocabularyIDF.Count];
        int index = 0;
        foreach (var vocab in vocabularyIDF)
        {
            int tf;
            stemCounts.TryGetValue(vocab.Key, out tf); // tf stays 0 when the term is absent
            tfidfs[index++] = tf * vocab.Value;
        }

        tfidfs = L2Normalization.Normalize(tfidfs);

        // Accumulate onto the existing score, one component at a time.
        foreach (double tfidf in tfidfs)
        {
            S.TFIDFScore = S.TFIDFScore + tfidf;
        }
    }
}