Uncategorized

How do I extract linked excel files inside an excel workbook using python?


I have an Excel workbook with some sheets containing data in text/tables of insurance plan details, some sheets containing images/screenshots of the details and some containing linked Excel files (inserted as objects).

Using python how do I iterate through each sheet and if there are linked excel file I would like to store them in another directory?

I tried this code:

import os
import shutil
import pathlib
import zipfile
import openpyxl
import re

def extract_linked_excel_files(path, output_folder_name="extracted_excel_files"):
    """
    Extracts linked Excel files from an Excel file and stores them in a single folder.

    Args:
        path (pathlib.Path or str): Excel file path.
        output_folder_name (str): Name of the folder to store the extracted Excel files.
            Defaults to 'extracted_excel_files'.

    Returns:
        new_paths (list[pathlib.Path]): List of paths to the extracted Excel files.
    """
    # Convert path to pathlib.Path if it's a string
    if isinstance(path, str):
        path = pathlib.Path(path)

    # Check if the file has the '.xlsx' extension
    if path.suffix != '.xlsx':
        raise ValueError('Path must be an xlsx file')

    # Extract the filename (excluding the extension) using .stem
    name = path.stem

    # Create a new folder for the extracted Excel files
    output_folder = path.parent / output_folder_name
    output_folder.mkdir(exist_ok=True)  # Create folder if it doesn't exist

    # Open the workbook
    workbook = openpyxl.load_workbook(path, read_only=True)

    # List to store the paths of the extracted Excel files
    new_paths = []

    try:
        # Iterate through all sheets in the workbook
        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]

            # Iterate through all rows in the sheet
            for row in sheet.iter_rows():
                # Iterate through all cells in the row
                for cell in row:
                    # Check if the cell contains a hyperlink
                    if cell.hyperlink:
                        # Check if the hyperlink is an Excel file
                        hyperlink_target = cell.hyperlink.target
                        if re.search(r'\.xlsx$', hyperlink_target, re.IGNORECASE):
                            linked_file_path = hyperlink_target.replace("https://stackoverflow.com/", os.path.sep)

                            # Construct paths for the linked Excel file and the new destination
                            linked_file_full_path = path.parent / linked_file_path
                            new_path = output_folder / linked_file_path

                            # Copy the linked Excel file to the output folder
                            shutil.copy(linked_file_full_path, new_path)

                            # Store the new path in the list
                            new_paths.append(new_path)

    finally:
        # Close the workbook
        workbook.close()

    # Return the list of paths to the extracted Excel files
    return new_paths

However this is just returning an empty workbook`



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *