Uncategorized

Parsing PDFs into Python structures


I am not proud of this…

Take a resume (or “CV” outside of the US) and return all the text within it in a formatted way. Right now the below script outputs it to a txt file.
I used this guide as inspiration and have looked to improve on it, by using slightly saner control flow and functions. Although the script works as intended, there are quite a lot of things that smell very bad.

Bad Things:

  1. Horrendous main()
  2. Kind of crazy looping (lots of nested indentations which is usually a bad sign).
  3. 2D structures and LOTS of lists (again! A bad sign).
  4. No use of yield so materialising a lot in memory.
  5. No use of @dataclass/NamedTuple (I feel like I should be modelling the PDFPage at least).
  6. Could this be vectorised?
  7. Converting it to an object-oriented design seems like a OK idea.
  8. Dumb statements like using pass and if table_in_page == -1
  9. PEP8 Violations

I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there are any fresh minds that want to rework it.

The Code

from typing import Any, Optional

import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextContainer, LTChar


def text_extraction(element: LTTextContainer) -> tuple[str, list[str]]:
    """
    Extracts text and unique formats (font names and sizes) from a given element.

    Parameters:
        element (LTTextContainer): The element from which text and formats are extracted.

    Returns:
        tuple[str, list[str]]: A tuple containing the extracted text and a list of unique formats.
    """
    line_text = element.get_text()
    line_formats = set()

    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    format_info = f"{character.fontname}, {character.size}"
                    line_formats.add(format_info)

    format_per_line = list(line_formats)
    return line_text, format_per_line


def extract_table(pdf_path: str, page_num: int, table_num: int) -> Optional[list[list[str]]]:
    """
    Extracts a specified table from a given page of a PDF document.

    Parameters:
        pdf_path (str): The file path of the PDF document.
        page_num (int): The page number from which to extract the table.
        table_num (int): The index of the table on the page to extract.

    Returns:
        Optional[list[list[str]]]: A 2D list representing the extracted table, or None if an error occurs.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Check if the page number is valid
            if page_num < 0 or page_num >= len(pdf.pages):
                raise ValueError("Page number out of range.")

            table_page = pdf.pages[page_num]
            tables = table_page.extract_tables()

            # Check if the table number is valid
            if table_num < 0 or table_num >= len(tables):
                raise ValueError("Table number out of range.")

            return tables[table_num]
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def table_converter(table: list[list[str]]) -> str:
    """
    Converts a 2D table into a string format, where each cell is separated by '|'
    and each row is on a new line. Newline characters in cells are replaced with spaces,
    and None values are converted to the string 'None'.

    Parameters:
        table (list[list[str]]): The 2D table to convert.

    Returns:
        str: The string representation of the table.

    Example usage:
        table = [['Name', 'Age'], ['Alice', '23'], ['Bob', None]]
        print(table_converter(table))
    """
    converted_rows = []
    for row in table:
        cleaned_row = [
            item.replace('\n', ' ') if item is not None else 'None'
            for item in row
        ]
        converted_rows.append('|' + '|'.join(cleaned_row) + '|')

    return '\n'.join(converted_rows)


def is_element_inside_any_table(element, page: LTPage, tables: list[Any]) -> bool:
    """
    Checks whether a given element is inside any of the tables on a PDF page.

    Parameters:
        element: The element to check.
        page (LTPage): The PDF page.
        tables (List[Any]): A list of tables, where each table is an object with a bounding box.

    Returns:
        bool: True if the element is inside any of the tables, False otherwise.
    """
    x0, y0up, x1, y1up = element.bbox
    page_height = page.bbox[3]
    # Transform coordinates
    y0, y1 = page_height - y1up, page_height - y0up

    for table in tables:
        tx0, ty0, tx1, ty1 = table.bbox
        # Check if element bbox is inside table bbox
        if tx0 <= x0 < x1 <= tx1 and ty0 <= y0 < y1 <= ty1:
            return True

    return False


def find_table_for_element(element, page: LTPage, tables: list[Any]) -> Optional[int]:
    """
    Finds the index of the table that a given element is inside on a PDF page.

    Parameters:
        element: The element to check.
        page (LTPage): The PDF page.
        tables (list[Any]): A list of tables, where each table is an object with a bounding box.

    Returns:
        Optional[int]: The index of the table that contains the element, or None if not found.
    """
    x0, y0up, x1, y1up = element.bbox
    page_height = page.bbox[3]
    # Transform coordinates
    y0, y1 = page_height - y1up, page_height - y0up

    for i, table in enumerate(tables):
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 < x1 <= tx1 and ty0 <= y0 < y1 <= ty1:
            return i  # Return the index of the table

    return None


def process_tables(tables, pdf_path, pagenum, text_from_tables):
    # Extracting the tables of the page
    for table_num in range(len(tables)):
        # Extract the information of the table
        table = extract_table(pdf_path, pagenum, table_num)
        # Convert the table information in structured string format
        table_string = table_converter(table)
        # Append the table string into a list
        text_from_tables.append(table_string)


def process_text_element(element, page_text, line_format, page_content):
    # Check if the element is text element
    if isinstance(element, LTTextContainer):
        # Use the function to extract the text and format for each text element
        (line_text, format_per_line) = text_extraction(element)
        # Append the text of each line to the page text
        page_text.append(line_text)
        # Append the format for each line containing text
        line_format.append(format_per_line)
        page_content.append(line_text)

    return line_format, page_content


def main(filepath: str) -> None:
    pdf = open(filepath, 'rb')
    text_per_page = {}

    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(filepath)):

        # Initialize the variables needed for the text extraction from the page
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []

        # Initialize the number of the examined tables
        table_in_page = -1
        pdf = pdfplumber.open(pdf_path)
        page_tables = pdf.pages[pagenum]
        tables = page_tables.find_tables()
        if len(tables) != 0:
            table_in_page = 0

        process_tables(tables, filepath, pagenum, text_from_tables)

        # Find all the elements
        page_elements = [(element.y1, element) for element in page._objs]
        # Sort all the element as they appear in the page
        page_elements.sort(key=lambda a: a[0], reverse=True)

        # Find the elements that composed a page
        for i, component in enumerate(page_elements):
            # Extract the element of the page layout
            element = component[1]

            # Check the elements for tables
            if table_in_page == -1:
                pass
            else:
                if is_element_inside_any_table(element, page, tables):
                    table_found = find_table_for_element(element, page,tables)
                    if table_found == table_in_page and table_found is not None:
                        page_content.append(text_from_tables[table_in_page])
                        page_text.append('table')
                        line_format.append('table')
                        table_in_page += 1

                    # Pass this iteration because the content of this element was extracted from the tables
                    continue

            if not is_element_inside_any_table(element, page, tables):
                line_format, page_content = process_text_element(element, page_text, line_format, page_content)

        # Add the list of list as value of the page key
        text_per_page[f'Page_{pagenum}'] = [page_text, line_format, text_from_images, text_from_tables, page_content]

    # Close the pdf file object
    pdf.close()

    # For now just write to file.
    result="".join(text_per_page['Page_0'][4])
    with open("/path/to/processed-resume.pdf.txt", "w") as text_file:
        text_file.write(result)


# TODO: this needs a lot of refinement.
if __name__ == "__main__":
    pdf_path="/path/to/any/test-resume.pdf"
    main(pdf_path)



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *