/**
 * Returns a sub sequence of the specified {@link CharSequence} with leading and trailing space
 * characters (' ') removed. Only the plain space character is stripped; other whitespace such as
 * tabs or line breaks is left in place.
 *
 * If the CharSequence has length zero, the same reference is returned. If it consists solely of
 * space characters, an empty sub sequence is returned.
 *
 * @param charSequence the CharSequence to trim.
 *
 * @return a sub sequence with leading and trailing spaces omitted.
 *
 * @throws ArgumentException if the charSequence is null.
 */
public static CharSequence trimCharSequence(CharSequence charSequence)
{
    if (charSequence == null)
    {
        String message = Logging.getMessage("nullValue.CharSequenceIsNull");
        Logging.logger().severe(message);
        throw new ArgumentException(message);
    }

    int length = charSequence.length();
    if (length == 0)
    {
        return charSequence;
    }

    // Advance past the leading spaces; ends at 'length' when every character is a space.
    int first = 0;
    while (first < length && charSequence.charAt(first) == ' ')
    {
        first++;
    }

    // Walk back over the trailing spaces, never crossing the first kept character.
    int last = length - 1;
    while (last > first && charSequence.charAt(last) == ' ')
    {
        last--;
    }

    // The end index of subSequence is exclusive, hence the + 1.
    return charSequence.subSequence(first, last + 1);
}
/**
 * Extracts the name portion from the characters of a tag.
 *
 * The character at index 0 is always skipped (presumably the opening '<' - confirm against
 * callers), and a '/' at index 1 is skipped as well. The name ends at the first '>' or
 * whitespace character found from index 1 onwards.
 *
 * @param tagChars the characters of the tag, starting at its opening character.
 *
 * @return the tag name; may be empty when the terminator directly follows the skipped prefix.
 *
 * @throws InvalidFormatException if no '>' or whitespace character is found.
 */
private static string extractTagName(CharSequence tagChars)
{
    int length = tagChars.length();

    // The name starts after the opening character, or after the '/' when one follows it.
    int nameStart = (length > 1 && tagChars.charAt(1) == '/') ? 2 : 1;

    int pos = 1;
    while (pos < length)
    {
        char current = tagChars.charAt(pos);
        if (current == '>' || StringUtil.isWhitespace(current))
        {
            return tagChars.subSequence(nameStart, pos).ToString();
        }
        pos++;
    }

    throw new InvalidFormatException("Failed to extract tag name!");
}
//Apply to one disease only
/// <summary>
/// Builds the prediction data of a single disease by counting, across all given publications,
/// how often each known symptom is recognised in the lower-cased title, abstract and full text.
/// </summary>
/// <param name="publications">Publications to scan.</param>
/// <param name="disease">Disease the resulting data is attached to.</param>
/// <returns>
/// Disease data whose related entities hold the raw occurrence count in their
/// <c>TFType.RawCount</c> term frequency. The list is neither sorted nor truncated here
/// (sorting and limiting to the configured maximum number of symptoms is currently disabled).
/// </returns>
public DiseaseData GetPredictionDataCountFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
{
    DiseaseData PredictionData = new DiseaseData(
        disease,
        new RelatedEntities(type.Symptom, new List<RelatedEntity>()));

    List<RelatedEntity> relatedEntities = PredictionData.RelatedEntities.RelatedEntitiesList;

    foreach (Publication publication in publications)
    {
        // Concatenate every textual part of the publication using the shared builder.
        stringBuilder.Clear();
        stringBuilder.Append(publication.title);
        stringBuilder.Append(" ");
        stringBuilder.Append(publication.abstractText);
        stringBuilder.Append(" ");
        stringBuilder.Append(publication.fullText);

        //Text preprocessing
        string text = stringBuilder.ToString().ToLower();

        //NAMED ENTITY RECOGNITION
        Chunking chunking = chunker.chunk(text);
        CharSequence cs = chunking.charSequence();
        Iterator iterator = chunking.chunkSet().iterator();

        while (iterator.hasNext())
        {
            Chunk chunk = (Chunk)iterator.next();
            string str = cs.subSequence(chunk.start(), chunk.end()).toString();

            int index = relatedEntities.FindIndex(
                symptom => symptom.Name.Equals(str) || symptom.Synonyms.IndexOf(str) != -1);

            if (index != -1)
            {
                // Already collected: bump its raw count.
                // NOTE(review): assumes every RelatedEntity carries a RawCount term frequency;
                // FirstOrDefault would yield null otherwise - confirm against RelatedEntity.
                relatedEntities[index].TermFrequencies
                    .FirstOrDefault(tf => tf.TFType == TFType.RawCount).Value++;
                continue;
            }

            //Find infos from phenotypes lists
            Symptom symptomFromPhenotypes = symptomsList
                .FirstOrDefault(x => x.Name.Equals(str) || x.Synonyms.IndexOf(str) != -1);

            //Add the real Symptom if it exists; chunks matching no known symptom are ignored
            if (symptomFromPhenotypes != null)
            {
                RelatedEntity myRealEntity = new RelatedEntity(
                    type.Symptom,
                    symptomFromPhenotypes.Name,
                    1.0,
                    symptomFromPhenotypes.Synonyms);

                myRealEntity.TermFrequencies
                    .FirstOrDefault(tf => tf.TFType == TFType.RawCount).Value = 1.0;

                relatedEntities.Add(myRealEntity);
            }
        }
    }

    return PredictionData;
}
//Apply to one disease only
/// <summary>
/// Builds the prediction data of a single disease: recognises symptoms in the lower-cased title,
/// abstract and full text of every publication, weights each symptom by its number of
/// occurrences, rescales the weights and keeps only the best-weighted symptoms.
/// </summary>
/// <param name="publications">Publications to scan.</param>
/// <param name="disease">Disease the resulting data is attached to.</param>
/// <returns>
/// Disease data whose related entities are sorted by descending weight and limited to
/// <c>ConfigurationManager.Instance.config.MaxNumberSymptoms</c> entries.
/// </returns>
public DiseaseData GetPredictionDataFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
{
    DiseaseData PredictionData = new DiseaseData(
        disease,
        new RelatedEntities(type.Symptom, new List<RelatedEntity>()));

    List<RelatedEntity> relatedEntities = PredictionData.RelatedEntities.RelatedEntitiesList;

    foreach (Publication publication in publications)
    {
        string text = publication.title + " " + publication.abstractText + " " + publication.fullText;

        //Text preprocessing
        text = text.ToLower();

        //NAMED ENTITY RECOGNITION
        Chunking chunking = chunkerHMM.chunk(text);
        CharSequence cs = chunking.charSequence();
        Iterator iterator = chunking.chunkSet().iterator();

        while (iterator.hasNext())
        {
            Chunk chunk = (Chunk)iterator.next();
            string str = cs.subSequence(chunk.start(), chunk.end()).toString();

            int index = relatedEntities.FindIndex(
                symptom => symptom.Name.Equals(str) || symptom.Synonyms.IndexOf(str) != -1);

            if (index != -1)
            {
                relatedEntities[index].Weight++;
                continue;
            }

            //Find infos from phenotypes lists
            Symptom symptomFromPhenotypes = symptomsList
                .FirstOrDefault(x => x.Name.Equals(str) || x.Synonyms.IndexOf(str) != -1);

            // A chunk that matches no known symptom is skipped; dereferencing the null result
            // previously threw a NullReferenceException.
            if (symptomFromPhenotypes != null)
            {
                //Add the real Symptom
                relatedEntities.Add(
                    new RelatedEntity(
                        type.Symptom,
                        symptomFromPhenotypes.Name,
                        1.0,
                        symptomFromPhenotypes.Synonyms));
            }
        }
    }

    //Symptom Weight Normalization from 0 to 100
    // Min and max are taken once, before any weight is modified: recomputing them inside the
    // loop would mix already-normalised values with raw ones and skew the result.
    // The emptiness guard is required because Max/Min throw on an empty sequence.
    if (relatedEntities.Count > 0)
    {
        double max = relatedEntities.Max(x => x.Weight);
        double min = relatedEntities.Min(x => x.Weight);

        for (int i = 0; i < relatedEntities.Count; i++)
        {
            if (max == min)
            {
                // All weights are equal (e.g. a single entity): rescaling would divide by zero,
                // so the weight is only capped at 100.
                if (relatedEntities[i].Weight > 100.0)
                {
                    relatedEntities[i].Weight = 100.0;
                }
            }
            else
            {
                relatedEntities[i].Weight = 100 * (relatedEntities[i].Weight - min) / (max - min);
            }
        }
    }

    //Sort by descending weight and take only the best symptoms (see config file)
    PredictionData.RelatedEntities.RelatedEntitiesList =
        PredictionData.RelatedEntities.RelatedEntitiesList
            .OrderByDescending(x => x.Weight)
            .Take(ConfigurationManager.Instance.config.MaxNumberSymptoms)
            .ToList();

    return PredictionData;
}
/**
 * Returns the part of the underlying text between the two given indices, as produced by
 * calling subSequence on the text with the same arguments.
 *
 * @param beginIndex the start index passed to subSequence.
 * @param endIndex the end index passed to subSequence.
 *
 * @return the requested sub sequence of the text.
 */
internal CharSequence getSubSequence(int beginIndex, int endIndex)
{
    CharSequence slice = text.subSequence(beginIndex, endIndex);
    return slice;
}