/// <summary>
/// Scans the leading lines of <paramref name="fileContent"/> and records, for each line,
/// its trimmed text, the offset at which that text starts, its length and its word count.
/// </summary>
/// <param name="fileContent">The text to scan, read line by line.</param>
/// <returns>
/// A <see cref="PdfFirstPartData"/> whose four lists are parallel (one entry per line read).
/// Lines for which <c>IsEmptyLine</c> is true are stored as an empty string with zeros.
/// </returns>
/// <remarks>
/// The line limit is only checked after a non-empty line has been recorded, so reading
/// stops at the first non-empty line whose 1-based number exceeds 30; blank lines beyond
/// that point are still recorded until such a line (or the end of the text) is reached.
/// </remarks>
private static PdfFirstPartData BuildFirstPartPdfData(string fileContent)
{
    var lineContents = new List<string>();
    var lineLengths = new List<int>();
    var lineStarts = new List<int>();
    var wordsPerLine = new List<int>();

    // StringReader is IDisposable; scope it so it is always released.
    using (var stringReader = new StringReader(fileContent))
    {
        string curLine;
        int lineNo = 0;

        while ((curLine = stringReader.ReadLine()) != null)
        {
            lineNo++;

            if (IsEmptyLine(curLine))
            {
                lineContents.Add("");
                lineLengths.Add(0);
                lineStarts.Add(0);
                wordsPerLine.Add(0);
                continue;
            }

            curLine = StringUtils.UnifyString(StringUtils.RemoveLigatures(curLine));
            string mainContent = curLine.Trim();

            // Ordinal search: we want a plain character offset. A culture-sensitive
            // IndexOf applies linguistic rules and can report a different position
            // for text containing ignorable characters.
            int preLength = curLine.IndexOf(mainContent, StringComparison.Ordinal);
            Debug.Assert(preLength >= 0);

            int numWords = mainContent
                .Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries)
                .Length;

            lineContents.Add(mainContent);
            lineStarts.Add(preLength);
            lineLengths.Add(mainContent.Length);
            wordsPerLine.Add(numWords);

            if (lineNo > 30)
            {
                break;
            }
        }
    }

    return new PdfFirstPartData()
    {
        LineContents = lineContents,
        LineLengths = lineLengths,
        LineStarts = lineStarts,
        WordsPerLine = wordsPerLine
    };
}
/// <summary>
/// Checks whether the first lines match the layout this method treats as "JAIR style":
/// more than four lines, a first line that starts at offset 0 and contains
/// "journal of" (case-insensitive), followed by three zero-length lines.
/// </summary>
/// <param name="firstPartData">Per-line data of the leading part of the document.</param>
/// <returns><c>true</c> when all of the above conditions hold; otherwise <c>false</c>.</returns>
private static bool IsJairStyle(PdfFirstPartData firstPartData)
{
    if (firstPartData.Count <= 4)
    {
        return false;
    }

    // Case-insensitive ordinal search instead of ToLower().Contains(...): no temporary
    // lower-cased copy and no dependence on the current culture's casing rules.
    bool firstLineIsJournalBanner =
        firstPartData.LineStarts[0] == 0 &&
        firstPartData.LineContents[0].IndexOf("journal of", StringComparison.OrdinalIgnoreCase) >= 0;

    return firstLineIsJournalBanner &&
           firstPartData.LineLengths[1] == 0 &&
           firstPartData.LineLengths[2] == 0 &&
           firstPartData.LineLengths[3] == 0;
}
/// <summary>
/// Tests the leading lines against the "JAIR style" layout as defined here: the data
/// must hold more than four lines, line 0 must start at offset 0 and contain
/// "journal of" (compared case-insensitively), and lines 1 to 3 must have length 0.
/// </summary>
/// <param name="firstPartData">Per-line data of the leading part of the document.</param>
/// <returns><c>true</c> if every condition is met; <c>false</c> otherwise.</returns>
private static bool IsJairStyle(PdfFirstPartData firstPartData)
{
    if (firstPartData.Count <= 4)
    {
        return false;
    }

    if (firstPartData.LineStarts[0] != 0)
    {
        return false;
    }

    // OrdinalIgnoreCase avoids allocating a lower-cased copy of the line and keeps the
    // match independent of the current culture (ToLower() is culture-sensitive).
    bool mentionsJournal =
        firstPartData.LineContents[0].IndexOf("journal of", StringComparison.OrdinalIgnoreCase) >= 0;
    if (!mentionsJournal)
    {
        return false;
    }

    return firstPartData.LineLengths[1] == 0
        && firstPartData.LineLengths[2] == 0
        && firstPartData.LineLengths[3] == 0;
}
/// <summary>
/// Heuristically locates the title line(s), the authors line and optional header
/// information in the leading lines of a PDF whose layout style is not known.
/// </summary>
/// <param name="firstPartData">Per-line data of the leading part of the document.</param>
/// <returns>
/// The extracted data, or <c>null</c> when no non-empty line exists.
/// <c>Title</c> and <c>AuthorsLine</c> hold the literal string "null" when they could not
/// be located; <c>AuthorNames</c> may be <c>null</c>; <c>Year</c> and <c>PubName</c> keep
/// their defaults (-1 and <c>null</c>) unless <c>ExtractHeaderInfo</c> is invoked on a header line.
/// </returns>
private static PdfExtractedData ExtractFromUnknownPdf(PdfFirstPartData firstPartData)
{
    List<string> authorNamesRegular = null, authorNamesByEmail = null, tempAuthNames = null;
    bool isAuthsByEmail = false;
    bool isAuthSure;
    int year = -1;
    string pubName = null;
    int authorsLineNo = -1;
    int titleLineEnd = -1;
    int titleLineNo = -1;
    bool titleEndIsMet = false;

    int firstNonEmptyLine = FirstNonEmptyLine(firstPartData.WordsPerLine, 0);
    if (firstNonEmptyLine < 0)
    {
        return null;
    }

    for (int i = firstNonEmptyLine; i < firstPartData.Count; i++)
    {
        // A blank line after the title has started closes the title block.
        if (firstPartData.WordsPerLine[i] == 0)
        {
            if (titleLineNo >= 0)
            {
                titleEndIsMet = true;
            }
            continue;
        }

        if (titleLineNo < 0 &&
            (StringLooksLikePdfHeader(firstPartData.LineContents[i]) ||
             (i + 1 < firstPartData.Count && StringLooksLikePdfHeader(firstPartData.LineContents[i + 1]))))
        {
            ExtractHeaderInfo(firstPartData.LineContents[i], out year, out pubName);

            // Skip the rest of the header block: advance to the first blank line.
            int j = i + 1;
            while (j < firstPartData.Count && firstPartData.WordsPerLine[j] != 0)
            {
                j++;
            }

            if (j >= firstPartData.Count)
            {
                // The header block runs to the end of the data; nothing left to inspect.
                break;
            }

            titleLineNo = FirstNonEmptyLine(firstPartData.WordsPerLine, j);
            if (titleLineNo < 0)
            {
                // Only blank lines follow the header. Without this guard the loop index
                // would be set to a negative value, which either restarts the scan from
                // the top (never terminating) or indexes out of range.
                break;
            }

            // Continue the classification below with the first line after the header.
            i = titleLineNo;
        }
        else if (titleLineNo < 0)
        {
            titleLineNo = i;
        }

        string line = firstPartData.LineContents[i];

        // the email part may look like affiliation, so it must be placed above affiliation check
        if (titleLineNo >= 0 && IsAuthorNameEmailPair(line, out tempAuthNames))
        {
            if (!isAuthsByEmail && authorsLineNo >= 0 && titleLineEnd < authorsLineNo)
            {
                // First name/email pair seen: the line recorded earlier as the authors
                // line is folded into the title range.
                titleLineEnd = authorsLineNo;
            }

            isAuthsByEmail = true;
            authorsLineNo = i;
            if (authorNamesByEmail == null)
            {
                authorNamesByEmail = new List<string>();
            }
            authorNamesByEmail.AddRange(tempAuthNames);
            titleEndIsMet = true;
        }
        else if (titleLineNo >= 0 && StringLooksLikeAbstractHeader(line))
        {
            break;
        }
        else if (titleLineNo >= 0 && StringLooksLikeAffiliation(line))
        {
            // Only marks the end of the title; the affiliation itself is not used yet.
            titleEndIsMet = true;
        }
        else if (!titleEndIsMet && titleLineNo >= 0 && StringLooksLikeTitle(line))
        {
            titleLineEnd = i;
        }
        else if (titleLineNo >= 0 && !titleEndIsMet && i >= 1 &&
                 firstPartData.LineContents[i - 1].EndsWith(":"))
        {
            // The previous line ends with a colon, so this line is taken as its continuation.
            titleLineEnd = i;
        }
        else if (titleLineNo >= 0 && i != titleLineNo && !isAuthsByEmail &&
                 StringLooksLikeAuthorNames(line, out isAuthSure, out tempAuthNames))
        {
            // TODO: the original author considered clearing authorNamesRegular and moving
            // titleLineEnd to the previous authors line when the title end is not yet met.
            if (authorNamesRegular == null)
            {
                authorNamesRegular = new List<string>();
            }
            authorNamesRegular.AddRange(tempAuthNames);
            authorsLineNo = i;

            if (isAuthSure)
            {
                titleEndIsMet = true;
            }
        }
        else if (titleLineNo >= 0 && !titleEndIsMet && authorsLineNo < 0)
        {
            titleLineEnd = i;
        }
    }

    // No explicit title end was found: fall back to the line just before the authors line.
    if (titleLineEnd < 0 && titleLineNo >= 0 && authorsLineNo > titleLineNo)
    {
        titleLineEnd = authorsLineNo - 1;
    }

    string strTitle = "null", strAuthors = "null";

    if (titleLineNo >= 0)
    {
        if (titleLineNo == titleLineEnd)
        {
            strTitle = firstPartData.LineContents[titleLineNo];
        }
        else
        {
            // Yields an empty title when titleLineEnd lies before titleLineNo.
            strTitle = "";
            for (int k = titleLineNo; k <= titleLineEnd; k++)
            {
                strTitle += firstPartData.LineContents[k] + " ";
            }
            strTitle = strTitle.Trim();
        }
    }

    if (authorsLineNo >= 0)
    {
        strAuthors = firstPartData.LineContents[authorsLineNo];
    }

    return new PdfExtractedData
    {
        AuthorNames = isAuthsByEmail ? authorNamesByEmail : authorNamesRegular,
        AuthorsLine = strAuthors,
        Title = strTitle,
        Year = year,
        PubName = pubName
    };
}
/// <summary>
/// Reads the beginning of <paramref name="fileContent"/> line by line and collects, per line,
/// the trimmed text, the offset of that text within the line, its length and its word count.
/// </summary>
/// <param name="fileContent">The text to read.</param>
/// <returns>
/// A <see cref="PdfFirstPartData"/> with four parallel lists, one entry per line read.
/// A line for which <c>IsEmptyLine</c> returns true contributes "" and three zeros.
/// </returns>
/// <remarks>
/// Reading ends after the first non-empty line whose 1-based line number is greater than 30.
/// Because the limit is tested only after a non-empty line is stored, blank lines past the
/// limit continue to be recorded until such a line or the end of the text is reached.
/// </remarks>
private static PdfFirstPartData BuildFirstPartPdfData(string fileContent)
{
    var lineContents = new List<string>();
    var lineLengths = new List<int>();
    var lineStarts = new List<int>();
    var wordsPerLine = new List<int>();
    var wordSeparators = new[] { ' ', '\t' };

    // Dispose the reader deterministically; the original never released it.
    using (var stringReader = new StringReader(fileContent))
    {
        int lineNo = 0;
        string curLine;

        while ((curLine = stringReader.ReadLine()) != null)
        {
            lineNo++;

            if (IsEmptyLine(curLine))
            {
                lineContents.Add("");
                lineLengths.Add(0);
                lineStarts.Add(0);
                wordsPerLine.Add(0);
                continue;
            }

            curLine = StringUtils.UnifyString(StringUtils.RemoveLigatures(curLine));
            string mainContent = curLine.Trim();

            // The start offset must be a raw character index, so search ordinally;
            // the default IndexOf(string) overload is culture-sensitive.
            int mainIdx = curLine.IndexOf(mainContent, StringComparison.Ordinal);
            Debug.Assert(mainIdx >= 0);

            int numWords = mainContent.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries).Length;

            lineContents.Add(mainContent);
            lineStarts.Add(mainIdx);
            lineLengths.Add(mainContent.Length);
            wordsPerLine.Add(numWords);

            if (lineNo > 30)
            {
                break;
            }
        }
    }

    var firstPartData = new PdfFirstPartData()
    {
        LineContents = lineContents,
        LineLengths = lineLengths,
        LineStarts = lineStarts,
        WordsPerLine = wordsPerLine
    };

    return firstPartData;
}
/// <summary>
/// Applies line-by-line heuristics to the leading lines of a PDF of unknown layout in order
/// to find the title range, the authors line, and optional header information.
/// </summary>
/// <param name="firstPartData">Per-line data of the leading part of the document.</param>
/// <returns>
/// The extracted data, or <c>null</c> if there is no non-empty line at all.
/// <c>Title</c> and <c>AuthorsLine</c> are the literal string "null" when not found;
/// <c>AuthorNames</c> may be <c>null</c>; <c>Year</c> / <c>PubName</c> stay at -1 / <c>null</c>
/// unless <c>ExtractHeaderInfo</c> is called for a header line.
/// </returns>
private static PdfExtractedData ExtractFromUnknownPdf(PdfFirstPartData firstPartData)
{
    List<string> authorNamesRegular = null, authorNamesByEmail = null, tempAuthNames = null;
    bool isAuthsByEmail = false;
    bool isAuthSure;
    int year = -1;
    string pubName = null;
    int authorsLineNo = -1;
    int titleLineEnd = -1;
    int titleLineNo = -1;
    bool titleEndIsMet = false;

    int firstNonEmptyLine = FirstNonEmptyLine(firstPartData.WordsPerLine, 0);
    if (firstNonEmptyLine < 0)
    {
        return null;
    }

    for (int i = firstNonEmptyLine; i < firstPartData.Count; i++)
    {
        if (firstPartData.WordsPerLine[i] == 0)
        {
            // Once a title has started, a blank line terminates it.
            if (titleLineNo >= 0)
            {
                titleEndIsMet = true;
            }
            continue;
        }

        bool titleNotStarted = titleLineNo < 0;
        if (titleNotStarted &&
            (StringLooksLikePdfHeader(firstPartData.LineContents[i]) ||
             (i + 1 < firstPartData.Count && StringLooksLikePdfHeader(firstPartData.LineContents[i + 1]))))
        {
            ExtractHeaderInfo(firstPartData.LineContents[i], out year, out pubName);

            // now skip headers and change i
            int j = i + 1;
            for (; j < firstPartData.Count; j++)
            {
                if (firstPartData.WordsPerLine[j] == 0)
                {
                    break;
                }
            }

            if (j >= firstPartData.Count)
            {
                // No blank line terminates the header block, so there is no title to find.
                break;
            }

            int lineAfterHeader = FirstNonEmptyLine(firstPartData.WordsPerLine, j);
            if (lineAfterHeader < 0)
            {
                // Nothing but blank lines after the header. Assigning this negative value
                // to the loop index would restart the scan from the top and loop forever
                // (or index out of range), so stop here instead.
                break;
            }

            titleLineNo = lineAfterHeader;
            i = lineAfterHeader;
        }
        else if (titleNotStarted)
        {
            titleLineNo = i;
        }

        // the email part may look like affiliation, so it must be placed above affiliation check
        if (titleLineNo >= 0 && IsAuthorNameEmailPair(firstPartData.LineContents[i], out tempAuthNames))
        {
            if (!isAuthsByEmail && authorsLineNo >= 0)
            {
                // A line was already recorded as the authors line before any name/email
                // pair was seen; extend the title range up to it.
                if (titleLineEnd < authorsLineNo)
                {
                    titleLineEnd = authorsLineNo;
                }
            }

            isAuthsByEmail = true;
            authorsLineNo = i;
            if (authorNamesByEmail == null)
            {
                authorNamesByEmail = new List<string>();
            }
            authorNamesByEmail.AddRange(tempAuthNames);
            titleEndIsMet = true;
        }
        else if (titleLineNo >= 0 && StringLooksLikeAbstractHeader(firstPartData.LineContents[i]))
        {
            break;
        }
        else if (titleLineNo >= 0 && StringLooksLikeAffiliation(firstPartData.LineContents[i]))
        {
            // do nothing yet beyond closing the title
            titleEndIsMet = true;
        }
        else if (!titleEndIsMet && titleLineNo >= 0 && StringLooksLikeTitle(firstPartData.LineContents[i]))
        {
            titleLineEnd = i;
        }
        else if (titleLineNo >= 0 && !titleEndIsMet && i >= 1 &&
                 firstPartData.LineContents[i - 1].EndsWith(":"))
        {
            // Previous line ends with ':' - treat this line as part of the title.
            titleLineEnd = i;
        }
        else if (titleLineNo >= 0 && i != titleLineNo && !isAuthsByEmail &&
                 StringLooksLikeAuthorNames(firstPartData.LineContents[i], out isAuthSure, out tempAuthNames))
        {
            // TODO: the original author considered resetting authorNamesRegular and moving
            // titleLineEnd to the previous authors line while the title end is not yet met.
            if (authorNamesRegular == null)
            {
                authorNamesRegular = new List<string>();
            }
            authorNamesRegular.AddRange(tempAuthNames);
            authorsLineNo = i;

            if (isAuthSure)
            {
                titleEndIsMet = true;
            }
        }
        else if (titleLineNo >= 0 && !titleEndIsMet && authorsLineNo < 0)
        {
            titleLineEnd = i;
        }
    }

    // Fallback when no title end was recorded: end the title just before the authors line.
    if (titleLineEnd < 0 && titleLineNo >= 0 && authorsLineNo > titleLineNo)
    {
        titleLineEnd = authorsLineNo - 1;
    }

    string strTitle = "null", strAuthors = "null";

    if (titleLineNo >= 0)
    {
        if (titleLineNo == titleLineEnd)
        {
            strTitle = firstPartData.LineContents[titleLineNo];
        }
        else
        {
            // Produces "" when titleLineEnd is before titleLineNo.
            strTitle = "";
            for (int k = titleLineNo; k <= titleLineEnd; k++)
            {
                strTitle += firstPartData.LineContents[k] + " ";
            }
            strTitle = strTitle.Trim();
        }
    }

    if (authorsLineNo >= 0)
    {
        strAuthors = firstPartData.LineContents[authorsLineNo];
    }

    return new PdfExtractedData
    {
        AuthorNames = isAuthsByEmail ? authorNamesByEmail : authorNamesRegular,
        AuthorsLine = strAuthors,
        Title = strTitle,
        Year = year,
        PubName = pubName
    };
}