I have an Excel workbook with some sheets containing data in text/tables of insurance plan details, some sheets containing images/screenshots of the details and some containing linked Excel files (inserted as objects).
Using python how do I iterate through each sheet and if there are linked excel file I would like to store them in another directory?
I tried this code:
import os
import shutil
import pathlib
import zipfile
import openpyxl
import re
def extract_linked_excel_files(path, output_folder_name="extracted_excel_files"):
"""
Extracts linked Excel files from an Excel file and stores them in a single folder.
Args:
path (pathlib.Path or str): Excel file path.
output_folder_name (str): Name of the folder to store the extracted Excel files.
Defaults to 'extracted_excel_files'.
Returns:
new_paths (list[pathlib.Path]): List of paths to the extracted Excel files.
"""
# Convert path to pathlib.Path if it's a string
if isinstance(path, str):
path = pathlib.Path(path)
# Check if the file has the '.xlsx' extension
if path.suffix != '.xlsx':
raise ValueError('Path must be an xlsx file')
# Extract the filename (excluding the extension) using .stem
name = path.stem
# Create a new folder for the extracted Excel files
output_folder = path.parent / output_folder_name
output_folder.mkdir(exist_ok=True) # Create folder if it doesn't exist
# Open the workbook
workbook = openpyxl.load_workbook(path, read_only=True)
# List to store the paths of the extracted Excel files
new_paths = []
try:
# Iterate through all sheets in the workbook
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
# Iterate through all rows in the sheet
for row in sheet.iter_rows():
# Iterate through all cells in the row
for cell in row:
# Check if the cell contains a hyperlink
if cell.hyperlink:
# Check if the hyperlink is an Excel file
hyperlink_target = cell.hyperlink.target
if re.search(r'\.xlsx$', hyperlink_target, re.IGNORECASE):
linked_file_path = hyperlink_target.replace("https://stackoverflow.com/", os.path.sep)
# Construct paths for the linked Excel file and the new destination
linked_file_full_path = path.parent / linked_file_path
new_path = output_folder / linked_file_path
# Copy the linked Excel file to the output folder
shutil.copy(linked_file_full_path, new_path)
# Store the new path in the list
new_paths.append(new_path)
finally:
# Close the workbook
workbook.close()
# Return the list of paths to the extracted Excel files
return new_paths
However this is just returning an empty workbook`