Example #1
0
        /// <summary>
        /// Reads the leading lines of <paramref name="fileContent"/> and records, for each line, its
        /// trimmed text, the length of that text, the offset at which it starts inside the normalized
        /// line, and the number of space/tab separated words it contains.
        /// </summary>
        /// <param name="fileContent">Text to scan; it is consumed line by line.</param>
        /// <returns>
        /// A <see cref="PdfFirstPartData"/> whose four lists are parallel (one entry per line read).
        /// Lines for which <c>IsEmptyLine</c> is true are stored as an empty string with zeros.
        /// </returns>
        private static PdfFirstPartData BuildFirstPartPdfData(string fileContent)
        {
            var lineContents = new List<string>();
            var lineLengths  = new List<int>();
            var lineStarts   = new List<int>();
            var wordsPerLine = new List<int>();

            // StringReader is IDisposable; dispose it deterministically.
            using (var stringReader = new StringReader(fileContent))
            {
                string curLine;
                int    lineNo = 0;

                while ((curLine = stringReader.ReadLine()) != null)
                {
                    lineNo++;
                    if (IsEmptyLine(curLine))
                    {
                        lineContents.Add("");
                        lineLengths.Add(0);
                        lineStarts.Add(0);
                        wordsPerLine.Add(0);
                        continue;
                    }

                    curLine = StringUtils.UnifyString(StringUtils.RemoveLigatures(curLine));
                    string mainContent = curLine.Trim();

                    // Ordinal search: we need the exact character offset of the trimmed text.
                    // The culture-sensitive default can skip ignorable characters and report
                    // an offset that does not correspond to the leading-whitespace length.
                    int mainIdx = curLine.IndexOf(mainContent, StringComparison.Ordinal);
                    Debug.Assert(mainIdx >= 0);

                    int numWords = mainContent.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries).Length;

                    lineContents.Add(mainContent);
                    lineStarts.Add(mainIdx);
                    lineLengths.Add(mainContent.Length);
                    wordsPerLine.Add(numWords);

                    // The cap is only evaluated after a non-empty line has been stored, so empty
                    // lines past the limit are still recorded until the next non-empty line.
                    if (lineNo > 30)
                    {
                        break;
                    }
                }
            }

            var firstPartData = new PdfFirstPartData()
            {
                LineContents = lineContents,
                LineLengths  = lineLengths,
                LineStarts   = lineStarts,
                WordsPerLine = wordsPerLine
            };

            return firstPartData;
        }
Example #2
0
        /// <summary>
        /// Tests whether the captured lines match the layout this file calls "JAIR style":
        /// the first line starts at column 0 and contains "journal of" (case-insensitively via
        /// lower-casing), and the next three lines all have zero length.
        /// </summary>
        /// <param name="firstPartData">Per-line data of the document's first part.</param>
        /// <returns><c>true</c> when the layout matches; otherwise <c>false</c>.</returns>
        private static bool IsJairStyle(PdfFirstPartData firstPartData)
        {
            // Four or fewer captured lines are never considered a match.
            if (firstPartData.Count <= 4)
            {
                return false;
            }

            bool bannerOnFirstLine =
                firstPartData.LineStarts[0] == 0 &&
                firstPartData.LineContents[0].ToLower().Contains("journal of");

            if (!bannerOnFirstLine)
            {
                return false;
            }

            // Lines 1..3 must all be empty.
            for (int lineIdx = 1; lineIdx <= 3; lineIdx++)
            {
                if (firstPartData.LineLengths[lineIdx] != 0)
                {
                    return false;
                }
            }

            return true;
        }
        /// <summary>
        /// Reports whether the first captured line begins at column 0 and contains "journal of"
        /// (compared after lower-casing) while the three lines that follow it have zero length.
        /// </summary>
        /// <param name="firstPartData">Per-line data of the document's first part.</param>
        /// <returns><c>true</c> if all of the conditions hold; otherwise <c>false</c>.</returns>
        private static bool IsJairStyle(PdfFirstPartData firstPartData)
        {
            // Documents with four or fewer captured lines never qualify.
            if (firstPartData.Count <= 4)
            {
                return false;
            }

            bool firstLineIsBanner = firstPartData.LineStarts[0] == 0
                && firstPartData.LineContents[0].ToLower().Contains("journal of");

            // Short-circuits in the same order as a nested if: banner first, then lines 1, 2, 3.
            return firstLineIsBanner
                && firstPartData.LineLengths[1] == 0
                && firstPartData.LineLengths[2] == 0
                && firstPartData.LineLengths[3] == 0;
        }
        /// <summary>
        /// Walks the captured first-part lines and heuristically picks out a title line range, an
        /// authors line, and (if a header line is recognized) a year and publication name. Line
        /// classification is delegated to the <c>StringLooksLike*</c> / <c>IsAuthorNameEmailPair</c>
        /// helpers; this method only tracks which line indices play which role.
        /// </summary>
        /// <param name="firstPartData">Per-line data of the document's first part.</param>
        /// <returns>
        /// <c>null</c> when there is no non-empty line at all. Otherwise a <see cref="PdfExtractedData"/>
        /// where <c>Title</c> and <c>AuthorsLine</c> are the literal string "null" if not found,
        /// <c>Year</c> is -1 and <c>PubName</c> is null unless <c>ExtractHeaderInfo</c> assigns them, and
        /// <c>AuthorNames</c> is the e-mail based list when any name/e-mail pair line was seen,
        /// else the regular list (which may be null).
        /// </returns>
        private static PdfExtractedData ExtractFromUnknownPdf(PdfFirstPartData firstPartData)
        {
            List<string> authorNamesRegular = null, authorNamesByEmail = null, tempAuthNames = null;
            bool isAuthsByEmail = false;
            bool isAuthSure;
            int year = -1;
            string pubName = null;
            int authorsLineNo = -1;
            int titleLineEnd = -1;
            int titleLineNo = -1;

            int firstNonEmptyLine = FirstNonEmptyLine(firstPartData.WordsPerLine, 0);
            if (firstNonEmptyLine < 0)
                return null;

            // Once set, later lines can no longer extend the title.
            bool titleEndIsMet = false;

            for (int i = firstNonEmptyLine; i < firstPartData.Count; i++)
            {
                if (firstPartData.WordsPerLine[i] == 0)
                {
                    // An empty line after the title has started closes the title.
                    if (titleLineNo >= 0) titleEndIsMet = true;
                    continue;
                }

                if (titleLineNo < 0 && (StringLooksLikePdfHeader(firstPartData.LineContents[i]) ||
                    (i + 1 < firstPartData.Count && StringLooksLikePdfHeader(firstPartData.LineContents[i + 1]))))
                {
                    ExtractHeaderInfo(firstPartData.LineContents[i], out year, out pubName);

                    // Skip the rest of the header block: advance j to the first empty line.
                    int j = i + 1;
                    for (; j < firstPartData.Count; j++)
                        if (firstPartData.WordsPerLine[j] == 0)
                            break;

                    if (j < firstPartData.Count) // i.e., the loop above stopped on an empty line
                    {
                        titleLineNo = FirstNonEmptyLine(firstPartData.WordsPerLine, j);
                        if (titleLineNo < 0)
                        {
                            // Only empty lines follow the header. Without this guard i would be
                            // set to a negative index, the for-increment would restart the scan,
                            // and the same header would be matched again forever.
                            break;
                        }
                        i = titleLineNo;
                    }
                    else
                    {
                        // Header block runs to the end of the captured lines.
                        break;
                    }
                }
                else if (titleLineNo < 0)
                {
                    titleLineNo = i;
                }

                // the email part may look like affiliation, so it must be placed above affiliation check
                if (titleLineNo >= 0 && IsAuthorNameEmailPair(firstPartData.LineContents[i], out tempAuthNames))
                {
                    // First e-mail style line after a regular authors candidate: that earlier
                    // candidate is folded into the title range.
                    if (!isAuthsByEmail && authorsLineNo >= 0)
                    {
                        if (titleLineEnd < authorsLineNo)
                            titleLineEnd = authorsLineNo;
                    }
                    isAuthsByEmail = true;
                    authorsLineNo = i;
                    if (authorNamesByEmail == null)
                        authorNamesByEmail = new List<string>();

                    authorNamesByEmail.AddRange(tempAuthNames);
                    titleEndIsMet = true;
                }
                else if (titleLineNo >= 0 && StringLooksLikeAbstractHeader(firstPartData.LineContents[i]))
                {
                    // Stop scanning at the abstract header.
                    break;
                }
                else if (titleLineNo >= 0 && StringLooksLikeAffiliation(firstPartData.LineContents[i]))
                {
                    titleEndIsMet = true;
                }
                else if (!titleEndIsMet && titleLineNo >= 0 && StringLooksLikeTitle(firstPartData.LineContents[i]))
                {
                    titleLineEnd = i;
                }
                else if (titleLineNo >= 0 && !titleEndIsMet && i >= 1 && firstPartData.LineContents[i - 1].EndsWith(":"))
                {
                    // Previous line ended with a colon: treat this line as a title continuation.
                    titleLineEnd = i;
                }
                else if (titleLineNo >= 0 && i != titleLineNo && !isAuthsByEmail && StringLooksLikeAuthorNames(firstPartData.LineContents[i], out isAuthSure, out tempAuthNames))
                {
                    // TODO: u may turn back the commented lines below
                    //if (!titleEndIsMet && authorsLineNo >= 0 && authorsLineNo >= titleLineEnd)
                    //{
                    //    if(authorNamesRegular != null)
                    //        authorNamesRegular.Clear();
                    //    titleLineEnd = authorsLineNo;
                    //}
                    if (authorNamesRegular == null)
                        authorNamesRegular = new List<string>();
                    authorNamesRegular.AddRange(tempAuthNames);
                    authorsLineNo = i;

                    if (isAuthSure)
                    {
                        titleEndIsMet = true;
                    }
                }
                else if (titleLineNo >= 0 && !titleEndIsMet && authorsLineNo < 0)
                {
                    // Unclassified line before any authors line: extend the title.
                    titleLineEnd = i;
                }
            }

            // No explicit title end was found: end the title just before the authors line.
            if (titleLineEnd < 0 && titleLineNo >= 0 && authorsLineNo > titleLineNo)
                titleLineEnd = authorsLineNo - 1;

            string strTitle = "null", strAuthors = "null";
            if (titleLineNo >= 0)
            {
                if (titleLineNo == titleLineEnd)
                {
                    strTitle = firstPartData.LineContents[titleLineNo];
                }
                else
                {
                    // Join the title lines with single spaces; yields "" when the range is empty.
                    strTitle = "";
                    for (int i = titleLineNo; i <= titleLineEnd; i++)
                    {
                        strTitle += firstPartData.LineContents[i] + " ";
                    }

                    strTitle = strTitle.Trim();
                }
            }

            if (authorsLineNo >= 0)
            {
                strAuthors = firstPartData.LineContents[authorsLineNo];
            }

            return new PdfExtractedData { AuthorNames = isAuthsByEmail ? authorNamesByEmail : authorNamesRegular,
                AuthorsLine = strAuthors, Title = strTitle, Year = year, PubName = pubName };
        }
        /// <summary>
        /// Reads the leading lines of <paramref name="fileContent"/> and builds four parallel lists:
        /// trimmed line text, trimmed length, start offset of the trimmed text within the
        /// normalized line, and the count of space/tab separated words.
        /// </summary>
        /// <param name="fileContent">Text to scan; it is consumed line by line.</param>
        /// <returns>
        /// A <see cref="PdfFirstPartData"/> holding the four lists. Lines for which
        /// <c>IsEmptyLine</c> is true contribute an empty string and zeros.
        /// </returns>
        private static PdfFirstPartData BuildFirstPartPdfData(string fileContent)
        {
            var lineContents = new List<string>();
            var lineLengths = new List<int>();
            var lineStarts = new List<int>();
            var wordsPerLine = new List<int>();

            // StringReader is IDisposable; dispose it deterministically.
            using (var stringReader = new StringReader(fileContent))
            {
                string curLine;
                int lineNo = 0;

                while ((curLine = stringReader.ReadLine()) != null)
                {
                    lineNo++;
                    if (IsEmptyLine(curLine))
                    {
                        lineContents.Add("");
                        lineLengths.Add(0);
                        lineStarts.Add(0);
                        wordsPerLine.Add(0);
                        continue;
                    }

                    curLine = StringUtils.UnifyString(StringUtils.RemoveLigatures(curLine));
                    string mainContent = curLine.Trim();

                    // Ordinal search gives the exact character offset of the trimmed text;
                    // the culture-sensitive default may ignore certain characters and
                    // return an offset that is not the leading-whitespace length.
                    int mainIdx = curLine.IndexOf(mainContent, StringComparison.Ordinal);
                    Debug.Assert(mainIdx >= 0);

                    int numWords = mainContent.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries).Length;

                    lineContents.Add(mainContent);
                    lineStarts.Add(mainIdx);
                    lineLengths.Add(mainContent.Length);
                    wordsPerLine.Add(numWords);

                    // The cap is checked only after storing a non-empty line, so empty lines
                    // beyond the limit are still recorded until the next non-empty line.
                    if (lineNo > 30)
                        break;
                }
            }

            var firstPartData = new PdfFirstPartData()
            {
                LineContents = lineContents,
                LineLengths = lineLengths,
                LineStarts = lineStarts,
                WordsPerLine = wordsPerLine
            };

            return firstPartData;
        }
Example #6
0
        /// <summary>
        /// Scans the captured first-part lines and heuristically determines the title line range,
        /// the authors line, and (when a header line is recognized) the year and publication name.
        /// Classification of individual lines is done by the <c>StringLooksLike*</c> and
        /// <c>IsAuthorNameEmailPair</c> helpers; this method tracks the resulting line indices.
        /// </summary>
        /// <param name="firstPartData">Per-line data of the document's first part.</param>
        /// <returns>
        /// <c>null</c> when no non-empty line exists. Otherwise a <see cref="PdfExtractedData"/>:
        /// <c>Title</c> / <c>AuthorsLine</c> are the literal string "null" when not found,
        /// <c>Year</c> stays -1 and <c>PubName</c> stays null unless <c>ExtractHeaderInfo</c> assigns
        /// them, and <c>AuthorNames</c> is the e-mail based list if any name/e-mail pair line was
        /// seen, else the regular list (possibly null).
        /// </returns>
        private static PdfExtractedData ExtractFromUnknownPdf(PdfFirstPartData firstPartData)
        {
            List<string> authorNamesRegular = null, authorNamesByEmail = null, tempAuthNames = null;
            bool         isAuthsByEmail = false;
            bool         isAuthSure;
            int          year          = -1;
            string       pubName       = null;
            int          authorsLineNo = -1;
            int          titleLineEnd  = -1;
            int          titleLineNo   = -1;

            int firstNonEmptyLine = FirstNonEmptyLine(firstPartData.WordsPerLine, 0);

            if (firstNonEmptyLine < 0)
            {
                return null;
            }

            // Once set, later lines can no longer extend the title.
            bool titleEndIsMet = false;

            for (int i = firstNonEmptyLine; i < firstPartData.Count; i++)
            {
                if (firstPartData.WordsPerLine[i] == 0)
                {
                    // An empty line after the title has started closes the title.
                    if (titleLineNo >= 0)
                    {
                        titleEndIsMet = true;
                    }
                    continue;
                }

                if (titleLineNo < 0 && (StringLooksLikePdfHeader(firstPartData.LineContents[i]) ||
                                        (i + 1 < firstPartData.Count && StringLooksLikePdfHeader(firstPartData.LineContents[i + 1]))))
                {
                    ExtractHeaderInfo(firstPartData.LineContents[i], out year, out pubName);

                    // Skip the rest of the header block: advance j to the first empty line.
                    int j = i + 1;
                    for (; j < firstPartData.Count; j++)
                    {
                        if (firstPartData.WordsPerLine[j] == 0)
                        {
                            break;
                        }
                    }

                    if (j < firstPartData.Count) // i.e., the loop above stopped on an empty line
                    {
                        titleLineNo = FirstNonEmptyLine(firstPartData.WordsPerLine, j);
                        if (titleLineNo < 0)
                        {
                            // Only empty lines follow the header. Without this guard i would
                            // become negative, the for-increment would restart the scan, and
                            // the same header would be matched again without end.
                            break;
                        }
                        i = titleLineNo;
                    }
                    else
                    {
                        // Header block runs to the end of the captured lines.
                        break;
                    }
                }
                else if (titleLineNo < 0)
                {
                    titleLineNo = i;
                }

                // the email part may look like affiliation, so it must be placed above affiliation check
                if (titleLineNo >= 0 && IsAuthorNameEmailPair(firstPartData.LineContents[i], out tempAuthNames))
                {
                    // First e-mail style line after a regular authors candidate: that earlier
                    // candidate is folded into the title range.
                    if (!isAuthsByEmail && authorsLineNo >= 0)
                    {
                        if (titleLineEnd < authorsLineNo)
                        {
                            titleLineEnd = authorsLineNo;
                        }
                    }
                    isAuthsByEmail = true;
                    authorsLineNo  = i;
                    if (authorNamesByEmail == null)
                    {
                        authorNamesByEmail = new List<string>();
                    }

                    authorNamesByEmail.AddRange(tempAuthNames);
                    titleEndIsMet = true;
                }
                else if (titleLineNo >= 0 && StringLooksLikeAbstractHeader(firstPartData.LineContents[i]))
                {
                    // Stop scanning at the abstract header.
                    break;
                }
                else if (titleLineNo >= 0 && StringLooksLikeAffiliation(firstPartData.LineContents[i]))
                {
                    titleEndIsMet = true;
                }
                else if (!titleEndIsMet && titleLineNo >= 0 && StringLooksLikeTitle(firstPartData.LineContents[i]))
                {
                    titleLineEnd = i;
                }
                else if (titleLineNo >= 0 && !titleEndIsMet && i >= 1 && firstPartData.LineContents[i - 1].EndsWith(":"))
                {
                    // Previous line ended with a colon: treat this line as a title continuation.
                    titleLineEnd = i;
                }
                else if (titleLineNo >= 0 && i != titleLineNo && !isAuthsByEmail && StringLooksLikeAuthorNames(firstPartData.LineContents[i], out isAuthSure, out tempAuthNames))
                {
                    // TODO: u may turn back the commented lines below
                    //if (!titleEndIsMet && authorsLineNo >= 0 && authorsLineNo >= titleLineEnd)
                    //{
                    //    if(authorNamesRegular != null)
                    //        authorNamesRegular.Clear();
                    //    titleLineEnd = authorsLineNo;
                    //}
                    if (authorNamesRegular == null)
                    {
                        authorNamesRegular = new List<string>();
                    }
                    authorNamesRegular.AddRange(tempAuthNames);
                    authorsLineNo = i;

                    if (isAuthSure)
                    {
                        titleEndIsMet = true;
                    }
                }
                else if (titleLineNo >= 0 && !titleEndIsMet && authorsLineNo < 0)
                {
                    // Unclassified line before any authors line: extend the title.
                    titleLineEnd = i;
                }
            }

            // No explicit title end was found: end the title just before the authors line.
            if (titleLineEnd < 0 && titleLineNo >= 0 && authorsLineNo > titleLineNo)
            {
                titleLineEnd = authorsLineNo - 1;
            }

            string strTitle = "null", strAuthors = "null";

            if (titleLineNo >= 0)
            {
                if (titleLineNo == titleLineEnd)
                {
                    strTitle = firstPartData.LineContents[titleLineNo];
                }
                else
                {
                    // Join the title lines with single spaces; yields "" when the range is empty.
                    strTitle = "";
                    for (int i = titleLineNo; i <= titleLineEnd; i++)
                    {
                        strTitle += firstPartData.LineContents[i] + " ";
                    }

                    strTitle = strTitle.Trim();
                }
            }

            if (authorsLineNo >= 0)
            {
                strAuthors = firstPartData.LineContents[authorsLineNo];
            }

            return new PdfExtractedData
            {
                AuthorNames = isAuthsByEmail ? authorNamesByEmail : authorNamesRegular,
                AuthorsLine = strAuthors, Title = strTitle, Year = year, PubName = pubName
            };
        }