I am not proud of this…
Take a resume (or “CV” outside of the US) and return all the text within it in a formatted way. Right now the below script outputs it to a txt file.
I used this guide as inspiration and have looked to improve on it, by using slightly saner control flow and functions. Although the script works as intended, there are quite a lot of things that smell very bad.
Bad Things:
- Horrendous
main()
- Kind of crazy looping (lots of nested indentations which is usually a bad sign).
- 2D structures and LOTS of lists (again! A bad sign).
- No use of
yield
so materialising a lot in memory. - No use of
@dataclass
/NamedTuple
(I feel like I should be modelling the PDFPage at least). - Could this be vectorised?
- Converting it to an object-oriented design seems like a OK idea.
- Dumb statements like using
pass
andif table_in_page == -1
- PEP8 Violations
I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there are any fresh minds that want to rework it.
The Code
from typing import Any, Optional
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextContainer, LTChar
def text_extraction(element: LTTextContainer) -> tuple[str, list[str]]:
"""
Extracts text and unique formats (font names and sizes) from a given element.
Parameters:
element (LTTextContainer): The element from which text and formats are extracted.
Returns:
tuple[str, list[str]]: A tuple containing the extracted text and a list of unique formats.
"""
line_text = element.get_text()
line_formats = set()
for text_line in element:
if isinstance(text_line, LTTextContainer):
for character in text_line:
if isinstance(character, LTChar):
format_info = f"{character.fontname}, {character.size}"
line_formats.add(format_info)
format_per_line = list(line_formats)
return line_text, format_per_line
def extract_table(pdf_path: str, page_num: int, table_num: int) -> Optional[list[list[str]]]:
"""
Extracts a specified table from a given page of a PDF document.
Parameters:
pdf_path (str): The file path of the PDF document.
page_num (int): The page number from which to extract the table.
table_num (int): The index of the table on the page to extract.
Returns:
Optional[list[list[str]]]: A 2D list representing the extracted table, or None if an error occurs.
"""
try:
with pdfplumber.open(pdf_path) as pdf:
# Check if the page number is valid
if page_num < 0 or page_num >= len(pdf.pages):
raise ValueError("Page number out of range.")
table_page = pdf.pages[page_num]
tables = table_page.extract_tables()
# Check if the table number is valid
if table_num < 0 or table_num >= len(tables):
raise ValueError("Table number out of range.")
return tables[table_num]
except Exception as e:
print(f"An error occurred: {e}")
return None
def table_converter(table: list[list[str]]) -> str:
"""
Converts a 2D table into a string format, where each cell is separated by '|'
and each row is on a new line. Newline characters in cells are replaced with spaces,
and None values are converted to the string 'None'.
Parameters:
table (list[list[str]]): The 2D table to convert.
Returns:
str: The string representation of the table.
Example usage:
table = [['Name', 'Age'], ['Alice', '23'], ['Bob', None]]
print(table_converter(table))
"""
converted_rows = []
for row in table:
cleaned_row = [
item.replace('\n', ' ') if item is not None else 'None'
for item in row
]
converted_rows.append('|' + '|'.join(cleaned_row) + '|')
return '\n'.join(converted_rows)
def is_element_inside_any_table(element, page: LTPage, tables: list[Any]) -> bool:
"""
Checks whether a given element is inside any of the tables on a PDF page.
Parameters:
element: The element to check.
page (LTPage): The PDF page.
tables (List[Any]): A list of tables, where each table is an object with a bounding box.
Returns:
bool: True if the element is inside any of the tables, False otherwise.
"""
x0, y0up, x1, y1up = element.bbox
page_height = page.bbox[3]
# Transform coordinates
y0, y1 = page_height - y1up, page_height - y0up
for table in tables:
tx0, ty0, tx1, ty1 = table.bbox
# Check if element bbox is inside table bbox
if tx0 <= x0 < x1 <= tx1 and ty0 <= y0 < y1 <= ty1:
return True
return False
def find_table_for_element(element, page: LTPage, tables: list[Any]) -> Optional[int]:
"""
Finds the index of the table that a given element is inside on a PDF page.
Parameters:
element: The element to check.
page (LTPage): The PDF page.
tables (list[Any]): A list of tables, where each table is an object with a bounding box.
Returns:
Optional[int]: The index of the table that contains the element, or None if not found.
"""
x0, y0up, x1, y1up = element.bbox
page_height = page.bbox[3]
# Transform coordinates
y0, y1 = page_height - y1up, page_height - y0up
for i, table in enumerate(tables):
tx0, ty0, tx1, ty1 = table.bbox
if tx0 <= x0 < x1 <= tx1 and ty0 <= y0 < y1 <= ty1:
return i # Return the index of the table
return None
def process_tables(tables, pdf_path, pagenum, text_from_tables):
# Extracting the tables of the page
for table_num in range(len(tables)):
# Extract the information of the table
table = extract_table(pdf_path, pagenum, table_num)
# Convert the table information in structured string format
table_string = table_converter(table)
# Append the table string into a list
text_from_tables.append(table_string)
def process_text_element(element, page_text, line_format, page_content):
# Check if the element is text element
if isinstance(element, LTTextContainer):
# Use the function to extract the text and format for each text element
(line_text, format_per_line) = text_extraction(element)
# Append the text of each line to the page text
page_text.append(line_text)
# Append the format for each line containing text
line_format.append(format_per_line)
page_content.append(line_text)
return line_format, page_content
def main(filepath: str) -> None:
pdf = open(filepath, 'rb')
text_per_page = {}
# We extract the pages from the PDF
for pagenum, page in enumerate(extract_pages(filepath)):
# Initialize the variables needed for the text extraction from the page
page_text = []
line_format = []
text_from_images = []
text_from_tables = []
page_content = []
# Initialize the number of the examined tables
table_in_page = -1
pdf = pdfplumber.open(pdf_path)
page_tables = pdf.pages[pagenum]
tables = page_tables.find_tables()
if len(tables) != 0:
table_in_page = 0
process_tables(tables, filepath, pagenum, text_from_tables)
# Find all the elements
page_elements = [(element.y1, element) for element in page._objs]
# Sort all the element as they appear in the page
page_elements.sort(key=lambda a: a[0], reverse=True)
# Find the elements that composed a page
for i, component in enumerate(page_elements):
# Extract the element of the page layout
element = component[1]
# Check the elements for tables
if table_in_page == -1:
pass
else:
if is_element_inside_any_table(element, page, tables):
table_found = find_table_for_element(element, page,tables)
if table_found == table_in_page and table_found is not None:
page_content.append(text_from_tables[table_in_page])
page_text.append('table')
line_format.append('table')
table_in_page += 1
# Pass this iteration because the content of this element was extracted from the tables
continue
if not is_element_inside_any_table(element, page, tables):
line_format, page_content = process_text_element(element, page_text, line_format, page_content)
# Add the list of list as value of the page key
text_per_page[f'Page_{pagenum}'] = [page_text, line_format, text_from_images, text_from_tables, page_content]
# Close the pdf file object
pdf.close()
# For now just write to file.
result="".join(text_per_page['Page_0'][4])
with open("/path/to/processed-resume.pdf.txt", "w") as text_file:
text_file.write(result)
# TODO: this needs a lot of refinement.
if __name__ == "__main__":
pdf_path="/path/to/any/test-resume.pdf"
main(pdf_path)