Exemplo n.º 1
0
        /// <summary>
        /// Stems every non-stop-word of every sentence in <paramref name="doc"/> and returns the
        /// stems that occur at least <paramref name="vocabularyThreshold"/> times in the whole document.
        /// </summary>
        /// <param name="doc">Document whose <c>sentences</c> (and their <c>words</c>) are scanned.</param>
        /// <param name="stemmedDocs">
        /// Receives one list of stems per sentence, in the same order as <c>doc.sentences</c>.
        /// A sentence with no usable words yields an empty list.
        /// </param>
        /// <param name="config">Configuration supplying the <c>Stemmer</c> used to stem each word.</param>
        /// <param name="vocabularyThreshold">Minimum number of occurrences a stem needs to enter the vocabulary.</param>
        /// <returns>The stems whose occurrence count reaches the threshold.</returns>
        private static List<string> GetVocabulary(Document doc, out List<List<string>> stemmedDocs, Configuration config, int vocabularyThreshold)
        {
            Dictionary<string, int> wordCountList = new Dictionary<string, int>();
            stemmedDocs = new List<List<string>>();

            foreach (Sentence sentence in doc.sentences)
            {
                List<string> stemmedDoc = new List<string>();

                foreach (wordDetails part in sentence.words)
                {
                    // Strip non-alphanumeric characters.
                    string stripped = Regex.Replace(part.word, "[^a-zA-Z0-9]", "");

                    // Nothing left to stem (pure punctuation), or a stop word: skip it.
                    // The stripped text is ASCII-only, so invariant lower-casing keeps the
                    // stop-word lookup independent of the current culture.
                    if (stripped.Length == 0 || StopWords.stopWordsList.Contains(stripped.ToLowerInvariant()))
                    {
                        continue;
                    }

                    string stem;
                    try
                    {
                        stem = config.Stemmer.Stem(stripped);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a word the stemmer cannot handle is reported and skipped.
                        Console.WriteLine("There is some error in Stemming: " + ex.Message);
                        continue;
                    }

                    if (string.IsNullOrEmpty(stem))
                    {
                        continue;
                    }

                    // Build the word count list. A missing key leaves count at 0, so the
                    // first occurrence is recorded as 1 (not 0).
                    int count;
                    wordCountList.TryGetValue(stem, out count);
                    wordCountList[stem] = count + 1;

                    stemmedDoc.Add(stem);
                }

                stemmedDocs.Add(stemmedDoc);
            }

            // Keep only the stems that are frequent enough.
            return wordCountList
                .Where(w => w.Value >= vocabularyThreshold)
                .Select(w => w.Key)
                .ToList();
        }
Exemplo n.º 2
0
 /// <summary>
 /// Loads the document at <paramref name="InputFilePath"/> and runs the search-string
 /// extraction, preprocessing and training-preparation steps on it.
 /// </summary>
 /// <param name="InputFilePath">Path handed to the <c>Document</c> constructor.</param>
 /// <param name="outFilePath">
 /// Path of the output file. It is created (or truncated) here; nothing is written to it by this method.
 /// </param>
 public void ProcessSearchStringWorkItem(string InputFilePath, string outFilePath)
 {
     // The output stream and writer are disposed even if a preprocessing step throws,
     // so the file handle is never leaked.
     using (FileStream fs1 = new FileStream(outFilePath, FileMode.Create))
     using (StreamWriter sw1 = new StreamWriter(fs1))
     {
         Document doc = new Document(InputFilePath);

         // Preprocess
         Preprocessor.extractSearchString(doc);
         Preprocessor.process(doc, _config);
         Preprocessor.PrepareTraining(doc, _config);
     }
 }
Exemplo n.º 3
0
 /// <summary>
 /// Loads the document at <paramref name="InputFilePath"/>, preprocesses and summarizes it,
 /// then prints the description / must-have / good-to-have sections to the console and
 /// writes the same text to <paramref name="outFilePath"/>.
 /// </summary>
 /// <param name="InputFilePath">Path handed to the <c>Document</c> constructor.</param>
 /// <param name="outFilePath">Path of the output file; created or truncated.</param>
 public void ProcessWorkItem(string InputFilePath, string outFilePath)
 {
     // Disposing via using guarantees the file is flushed and closed even when
     // preprocessing or summarizing throws.
     using (FileStream fs1 = new FileStream(outFilePath, FileMode.Create))
     using (StreamWriter sw1 = new StreamWriter(fs1))
     {
         Document doc = new Document(InputFilePath);

         // Preprocess
         Preprocessor.process(doc, _config);

         // Summarize
         Console.WriteLine("Summerizing the job Descriptions.....\n");
         _config.Summarizer.Summarize(doc, _config);

         Console.WriteLine("\nDescription:\n{0}\n\nMust Have:\n{1}\n\nGood To Have:\n{2}\n\n", doc.description, doc.MustHave, doc.GoodTohave);

         string allResults = "\nDescription:\n" + doc.description + "\n\nMust Have:\n" + doc.MustHave + "\n\nGood To Have:\n" + doc.GoodTohave + "\n\n";
         sw1.WriteLine(allResults);
     }
 }
        /// <summary>
        /// Scores every sentence of <paramref name="doc"/> as a candidate "description" sentence,
        /// keeps the five highest-scoring ones and stores them, concatenated in sentence-number
        /// order, in <c>doc.description</c>. Each sentence's <c>Descriptionscore</c> is updated.
        /// </summary>
        /// <param name="doc">Document whose sentences are scored and whose description is set.</param>
        void DescriptionSummary(Document doc)
        {
            const int MaxSummarySentences = 5;

            Dictionary<int, double> descriptionSentScores = new Dictionary<int, double>();
            Dictionary<int, string> sentences = new Dictionary<int, string>();

            foreach (Sentence S in doc.sentences)
            {
                // Sentences under a heading mentioning "description" get a fixed boost.
                // Ordinal, case-insensitive matching avoids culture-specific lower-casing surprises.
                double isDescription =
                    S.paraHeading.IndexOf("description", StringComparison.OrdinalIgnoreCase) >= 0 ? 5 : 0;

                // Earlier sentences score higher. The division must be floating point: with
                // integer operands 1 / NoSent is 0 for every sentence after the first and
                // throws for sentence number 0. Numbers below 1 are treated as 1.
                double positionBonus = 1.0 / Math.Max(1, S.NoSent);

                // This must be done more effectively using any Classification ML algorithm.
                // For that we need manually annotated data, so it is left for the future.
                S.Descriptionscore = S.TFIDFScore + S.Lenght * 0.001 + S.JJCount * 0.003 + positionBonus * 0.5 + S.upperCaseLettersCount * 0.05 + isDescription;

                descriptionSentScores.Add(S.NoSent, S.Descriptionscore);
                sentences.Add(S.NoSent, S.sent);
            }

            // Pick the best-scoring sentences, then restore document order by sentence number.
            SortedDictionary<int, string> extracted = new SortedDictionary<int, string>();
            foreach (var best in descriptionSentScores.OrderByDescending(s => s.Value).Take(MaxSummarySentences))
            {
                extracted.Add(best.Key, sentences[best.Key]);
            }

            doc.description = string.Concat(extracted.Values);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Computes TF-IDF scores for the sentences of <paramref name="doc"/>.
        /// The vocabulary and per-sentence stems are always extracted; the IDF table
        /// (<c>_IDF</c>) is only filled when it is still empty, otherwise the existing
        /// table is reused as-is.
        /// </summary>
        /// <param name="doc">Document to transform.</param>
        /// <param name="config">Configuration forwarded to the vocabulary and vector steps.</param>
        /// <param name="vocabularyThreshold">Minimum count forwarded to <c>GetVocabulary</c>.</param>
        public static void Transform(Document doc, Configuration config, int vocabularyThreshold = 3)
        {
            // Stem the sentences and collect the vocabulary in a single pass.
            List<List<string>> stemmedSentences;
            List<string> terms = GetVocabulary(doc, out stemmedSentences, config, vocabularyThreshold);

            if (_IDF.Count == 0)
            {
                double sentenceTotal = stemmedSentences.Count;

                // IDF(term) = ln(total sentences / (1 + sentences containing the term)).
                foreach (string term in terms)
                {
                    double containing = stemmedSentences.Count(stems => stems.Contains(term));
                    _IDF[term] = Math.Log(sentenceTotal / (1.0 + containing));
                }
            }

            // Turn each sentence into TF-IDF values using the (possibly pre-existing) IDF table.
            TransformToTFIDFVectors(doc, config, _IDF);
        }
        /// <summary>
        /// Scores every sentence of <paramref name="doc"/> as a "good to have" candidate, stores
        /// the score in <c>Qualificationscore</c>, and sets <c>doc.GoodTohave</c> to the three
        /// highest-scoring sentences, best first, each followed by a newline.
        /// </summary>
        /// <param name="doc">Document whose sentences are scored and whose summary is set.</param>
        void GoodToHaveSummary(Document doc)
        {
            const string mustHavePattern = "(must|required|should|minimum|strong)";

            Dictionary<int, double> scoreBySentence = new Dictionary<int, double>();
            Dictionary<int, string> textBySentence = new Dictionary<int, string>();

            foreach (Sentence S in doc.sentences)
            {
                // Heading boost: any heading that mentions "qualification" or does not
                // mention "description".
                string heading = S.paraHeading.ToLower();
                bool headingQualifies = heading.Contains("qualification") || !heading.Contains("description");
                double headingBoost = headingQualifies ? 10 : 0;

                // Sentences WITHOUT a must-have keyword receive the bonus.
                // NOTE(review): the pattern is case-sensitive, so "Must"/"Required" do not
                // count as must-have words here; confirm that is intended.
                double keywordBoost = Regex.IsMatch(S.sent, mustHavePattern) ? 0 : 5;

                S.Qualificationscore = S.TFIDFScore + S.Lenght * 0.001 + S.JJCount * 0.003 + S.nounCount * 0.05 + S.upperCaseLettersCount * 0.5 + headingBoost + keywordBoost;

                scoreBySentence.Add(S.NoSent, S.Qualificationscore);
                textBySentence.Add(S.NoSent, S.sent);
            }

            // Highest score first; keep the top three.
            var topThree = scoreBySentence
                .OrderByDescending(entry => entry.Value)
                .Take(3)
                .Select(entry => textBySentence[entry.Key] + "\n");

            doc.GoodTohave = string.Concat(topThree);
        }
Exemplo n.º 7
0
        /// <summary>
        /// For every sentence of <paramref name="doc"/>, builds a vector with one TF-IDF value per
        /// entry of <paramref name="vocabularyIDF"/>, L2-normalizes it, and adds the sum of the
        /// normalized components to the sentence's existing <c>TFIDFScore</c>.
        /// </summary>
        /// <param name="doc">Document whose sentences are scored.</param>
        /// <param name="config">Not used by this method.</param>
        /// <param name="vocabularyIDF">Term-to-IDF table; its enumeration order defines the vector layout.</param>
        private static void TransformToTFIDFVectors(Document doc, Configuration config, Dictionary<string, double> vocabularyIDF)
        {
            foreach (Sentence sentence in doc.sentences)
            {
                // Term frequency = number of words in this sentence equal to the term.
                // NOTE(review): this compares the raw word text with the table keys; if the keys
                // are stems (as produced by GetVocabulary), inflected words will not match - confirm.
                double[] weights = vocabularyIDF
                    .Select(term => sentence.words.Count(w => w.word == term.Key) * term.Value)
                    .ToArray();

                weights = L2Normalization.Normalize(weights);

                // The score is accumulated on top of whatever value the sentence already carries.
                for (int i = 0; i < weights.Length; i++)
                {
                    sentence.TFIDFScore += weights[i];
                }
            }
        }