How can I make this Python script process subfolders too?

2024/10/5 19:10:01

Which part of the code do I need to change in order to include subfolders?


import glob
import os
import sys
from typing import List


def get_filenames(
    filepath: str, pattern: str, *, recursive: bool = False
) -> List[str]:
    """Return all paths under ``filepath`` whose name matches ``pattern``.

    Args:
        filepath (str): folder path.
        pattern (str): filename pattern (glob syntax, e.g. ``"*.pdf"``).
        recursive (bool): when True, subfolders of ``filepath`` are searched
            as well; when False (the default) only ``filepath`` itself is
            searched, which is the original behaviour.

    Returns:
        List[str]: list of paths.

    Raises:
        SystemExit: if nothing matches the pattern.
    """
    if recursive:
        # "**" matches zero or more directory levels, so files directly in
        # ``filepath`` are still included alongside those in subfolders.
        search = os.path.join(filepath, "**", pattern)
    else:
        search = os.path.join(filepath, pattern)

    filenames = glob.glob(search, recursive=recursive)
    if not filenames:
        # sys.exit() raises SystemExit and never returns a value.
        sys.exit("Error: no file found, check the documentation for more info.")
    return filenames

import math

import click

import pdf_split_tool.file_handler
import pdf_split_tool.pdf_splitter


def _confirm_split_file(filepath: str, max_size_bytes: int) -> None:
    """Split one PDF, asking the user first when the file is flagged.

    Args:
        filepath: PDF path.
        max_size_bytes: max size in bytes.
    """
    splitter = pdf_split_tool.pdf_splitter.PdfSplitter(filepath)
    # NOTE(review): the validity check is hard-coded to True, so the warning
    # and confirmation prompt below can never run. Presumably a per-page size
    # check was intended here - confirm before wiring one in.
    valid = True
    if not valid:
        click.secho(
            (
                "Warning: {} has more than 200kb per page. "
                "Consider reducing resolution before splitting."
            ).format(filepath),
            fg="yellow",
        )
        if not click.confirm("Do you want to continue?"):
            click.secho("{} skipped.".format(filepath), fg="blue")
            return
    splitter.split_max_size(max_size_bytes)


# The docstring of ``main`` doubles as the command's --help text, so it is
# kept short; details live in these comments instead.
# ``filepath`` may be a single ".pdf" file or a folder; for a folder, every
# "*.pdf" returned by file_handler.get_filenames is processed in turn.
@click.command()
@click.argument("filepath", type=click.Path(exists=True), default=".")
@click.option(
    "-m",
    "--max-size",
    type=float,
    help="Max size in megabytes.",
    default=20,
    show_default=True,
)
def main(filepath: str, max_size: float) -> None:
    """Pdf Split Tool."""
    max_size_bytes = math.floor(max_size * 1024 * 1024)  # convert to bytes
    if filepath.endswith(".pdf"):
        _confirm_split_file(filepath, max_size_bytes)
    else:
        filepaths = pdf_split_tool.file_handler.get_filenames(filepath, "*.pdf")
        for path in filepaths:
            _confirm_split_file(path, max_size_bytes)


if __name__ == "__main__":
    main(prog_name="pdf-split-tool")  # pragma: no cover

import os
import sys
import tempfile

import PyPDF4


class PdfSplitter:
    """Split one PDF file into several smaller PDFs.

    Attributes are set once in the constructor:
        filepath: path of the source PDF.
        input_pdf: PyPDF4 reader for the source PDF.
        total_pages: number of pages in the source PDF.
        size: size of the source file on disk, in bytes.
        avg_size: ``size / total_pages`` (0 for a document without pages).
    """

    def __init__(self, filepath: str) -> None:
        """Open ``filepath``, gather its size statistics and print them.

        Args:
            filepath: path of the PDF to split.
        """
        self.filepath = filepath
        # The original passed "rb" as a second positional argument; that slot
        # is not a file mode. NOTE(review): presumably it is the reader's
        # ``strict`` flag, whose default matches the truthy "rb" - confirm.
        self.input_pdf = PyPDF4.PdfFileReader(filepath)
        self.total_pages = self.input_pdf.getNumPages()
        self.size = os.path.getsize(filepath)
        # Guard against a page-less document instead of dividing by zero.
        self.avg_size = self.size / self.total_pages if self.total_pages else 0
        print(
            "File: {}\nFile size: {}\nTotal pages: {}\nAverage size: {}".format(
                filepath, self.size, self.total_pages, self.avg_size
            )
        )

    def _get_pdf_size(self, pdf_writer: PyPDF4.PdfFileWriter) -> int:
        """Measure how large ``pdf_writer`` would be when written to disk.

        The writer is serialised into a temporary file that is discarded
        when the ``with`` block exits.

        Args:
            pdf_writer: pdf writer.

        Returns:
            int: generated file size in bytes.
        """
        with tempfile.TemporaryFile(mode="wb") as fp:
            pdf_writer.write(fp)
            return fp.tell()

    def split_max_size(self, max_size: int) -> int:
        """Create new files, each at most ``max_size`` bytes where possible.

        Output files are written next to the source as ``<name>-<n><ext>``.
        A chunk always contains at least one page, so a single page that is
        larger than ``max_size`` is written on its own (and exceeds the
        limit) rather than stalling the split.

        Args:
            max_size: maximum size of each output file, in bytes.

        Returns:
            int: number of PDFs created (0 when the source already fits or
            has no pages).
        """
        if self.size <= max_size or self.total_pages == 0:
            return 0

        # First guess at pages per chunk, never below one page: a zero step
        # would produce empty chunks and never advance.
        avg_step = max(1, int(max_size / self.avg_size))
        # splitext touches only the real extension, so ".pdf" elsewhere in
        # the path is left alone and an upper-case ".PDF" source is never
        # overwritten by its own first chunk.
        root, ext = os.path.splitext(self.filepath)

        pdfs_count = 0
        current_page = 0
        while current_page < self.total_pages:
            end_page = min(current_page + avg_step, self.total_pages)

            # Shrink the chunk one page at a time until it fits.
            while True:
                pdf_writer = PyPDF4.PdfFileWriter()
                for page in range(current_page, end_page):
                    pdf_writer.addPage(self.input_pdf.getPage(page))
                current_size = self._get_pdf_size(pdf_writer)
                # Kept from the original: a fresh reader is opened after each
                # measurement. NOTE(review): presumably a workaround for
                # reader state being disturbed by the write - confirm.
                self.input_pdf = PyPDF4.PdfFileReader(self.filepath)
                if current_size <= max_size or end_page - current_page <= 1:
                    break
                end_page -= 1

            with open("{}-{}{}".format(root, pdfs_count, ext), "wb") as out:
                pdf_writer.write(out)

            current_page = end_page
            pdfs_count += 1
        return pdfs_count

What you could do is, for each entry in filenames, check whether it is a folder and, if it is, run the function again on it recursively.

To check whether a path is a folder or not, you can use

    os.path.isdir(path)

where path is the path to the file

EDIT: Posting the code as text is better than posting an image, because it lets people show you the solution without having to retype everything


You could try the following; I tried it myself and it should hopefully do what you want

def get_filenames(filepath, pattern, file_list=None):
    """Collect every file matching ``pattern`` in ``filepath`` and below.

    Args:
        filepath: folder to search.
        pattern: filename pattern in glob syntax, e.g. ``"*.pdf"``.
        file_list: optional list to append the results to; a new list is
            created when omitted.

    Returns:
        The list of matching file paths (``file_list`` itself when one was
        passed in). Matches from a folder come before those of its
        subfolders, and each level is visited in sorted order.
    """
    if file_list is None:
        file_list = []

    # Files of this folder that match the pattern. Directories are skipped
    # here even when their name happens to match.
    for match in sorted(glob.glob(os.path.join(filepath, pattern))):
        if os.path.isfile(match):
            file_list.append(match)

    # Descend into every subfolder, not only those whose name matches the
    # pattern, and keep going after the first one (no early return).
    for name in sorted(os.listdir(filepath)):
        child = os.path.join(filepath, name)
        # Symlinked folders are not followed, to avoid cycles.
        if os.path.isdir(child) and not os.path.islink(child):
            get_filenames(child, pattern, file_list)

    return file_list

