mirror of
				https://github.com/kevinveenbirkenbach/directory-content-scanner.git
				synced 2025-11-04 03:48:22 +00:00 
			
		
		
		
	Added binary ignoration
This commit is contained in:
		
							
								
								
									
										7
									
								
								cli.py
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								cli.py
									
									
									
									
									
								
							@@ -5,14 +5,15 @@ def parse_arguments():
 | 
				
			|||||||
        description="Scan directories and print/compile file contents."
 | 
					        description="Scan directories and print/compile file contents."
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
 | 
					    parser.add_argument("paths", nargs='+', help="List of files or directories to scan.")
 | 
				
			||||||
    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt .log).")
 | 
					    parser.add_argument("--file-types", nargs='+', default=[], help="Filter by file types (e.g., .txt, .log).")
 | 
				
			||||||
    parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
 | 
					    parser.add_argument("--ignore-file-strings", nargs='+', default=[], help="Ignore files and folders containing these strings.")
 | 
				
			||||||
    parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
 | 
					    parser.add_argument("--ignore-hidden", action='store_true', help="Ignore hidden directories and files.")
 | 
				
			||||||
    parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
 | 
					    parser.add_argument("-v", "--verbose", action='store_true', help="Enable verbose mode.")
 | 
				
			||||||
    parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
 | 
					    parser.add_argument("--no-comments", action='store_true', help="Remove comments from the displayed content based on file type.")
 | 
				
			||||||
    parser.add_argument("--compress", action='store_true', help="Compress code (for Python files).")
 | 
					    parser.add_argument("--compress", action='store_true', help="Compress code (for supported file types).")
 | 
				
			||||||
    parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
 | 
					    parser.add_argument("--path-contains", nargs='+', default=[], help="Display files whose paths contain one of these strings.")
 | 
				
			||||||
    parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
 | 
					    parser.add_argument("--content-contains", nargs='+', default=[], help="Display files containing one of these strings in their content.")
 | 
				
			||||||
    parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.")
 | 
					    parser.add_argument("--no-gitignore", action='store_true', help="Do not respect .gitignore files during scan.")
 | 
				
			||||||
 | 
					    parser.add_argument("--scan-binary-files", action='store_true', help="Scan binary files as well (by default these are ignored).")
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    return parser.parse_args()
 | 
					    return parser.parse_args()
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -20,11 +20,36 @@ class DirectoryHandler:
 | 
				
			|||||||
                        lines = f.readlines()
 | 
					                        lines = f.readlines()
 | 
				
			||||||
                    # Filter out empty lines and comments.
 | 
					                    # Filter out empty lines and comments.
 | 
				
			||||||
                    patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
 | 
					                    patterns = [line.strip() for line in lines if line.strip() and not line.strip().startswith('#')]
 | 
				
			||||||
                    # Save the base directory and its patterns
 | 
					                    # Save the base directory and its patterns.
 | 
				
			||||||
                    gitignore_data.append((dirpath, patterns))
 | 
					                    gitignore_data.append((dirpath, patterns))
 | 
				
			||||||
                except Exception as e:
 | 
					                except Exception as e:
 | 
				
			||||||
                    print(f"Error reading {gitignore_path}: {e}")
 | 
					                    print(f"Error reading {gitignore_path}: {e}")
 | 
				
			||||||
        return gitignore_data
 | 
					        return gitignore_data
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def is_binary_file(file_path):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Reads the first 1024 bytes of file_path and heuristically determines
 | 
				
			||||||
 | 
					        if the file appears to be binary. This method returns True if a null byte
 | 
				
			||||||
 | 
					        is found or if more than 30% of the bytes in the sample are non-text.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            with open(file_path, 'rb') as f:
 | 
				
			||||||
 | 
					                chunk = f.read(1024)
 | 
				
			||||||
 | 
					            # If there's a null byte, it's almost certainly binary.
 | 
				
			||||||
 | 
					            if b'\x00' in chunk:
 | 
				
			||||||
 | 
					                return True
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # Define a set of text characters (ASCII printable + common control characters)
 | 
				
			||||||
 | 
					            text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x7F)))
 | 
				
			||||||
 | 
					            # Count non-text characters in the chunk.
 | 
				
			||||||
 | 
					            non_text = sum(byte not in text_chars for byte in chunk)
 | 
				
			||||||
 | 
					            if len(chunk) > 0 and (non_text / len(chunk)) > 0.30:
 | 
				
			||||||
 | 
					                return True
 | 
				
			||||||
 | 
					        except Exception:
 | 
				
			||||||
 | 
					            # If the file cannot be read in binary mode, assume it's not binary.
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def is_gitignored(file_path, gitignore_data):
 | 
					    def is_gitignored(file_path, gitignore_data):
 | 
				
			||||||
@@ -37,12 +62,12 @@ class DirectoryHandler:
 | 
				
			|||||||
            try:
 | 
					            try:
 | 
				
			||||||
                rel_path = os.path.relpath(file_path, base_dir)
 | 
					                rel_path = os.path.relpath(file_path, base_dir)
 | 
				
			||||||
            except ValueError:
 | 
					            except ValueError:
 | 
				
			||||||
                # file_path and base_dir are on different drives
 | 
					                # file_path and base_dir are on different drives.
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            # If the file is not under the current .gitignore base_dir, skip it.
 | 
					            # If the file is not under the current .gitignore base_dir, skip it.
 | 
				
			||||||
            if rel_path.startswith('..'):
 | 
					            if rel_path.startswith('..'):
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            # Check all patterns
 | 
					            # Check all patterns.
 | 
				
			||||||
            for pattern in patterns:
 | 
					            for pattern in patterns:
 | 
				
			||||||
                if pattern.endswith('/'):
 | 
					                if pattern.endswith('/'):
 | 
				
			||||||
                    # Directory pattern: check if any folder in the relative path matches.
 | 
					                    # Directory pattern: check if any folder in the relative path matches.
 | 
				
			||||||
@@ -84,10 +109,15 @@ class DirectoryHandler:
 | 
				
			|||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains):
 | 
					    def should_print_file(file_path, file_types, ignore_file_strings, ignore_hidden, path_contains, content_contains, scan_binary_files=False):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Determines if a file should be printed based on various criteria.
 | 
					        Determines if a file should be printed based on various criteria.
 | 
				
			||||||
 | 
					        By default, binary files are skipped unless scan_binary_files is True.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					        # Check binary file status using our heuristic.
 | 
				
			||||||
 | 
					        if not scan_binary_files and DirectoryHandler.is_binary_file(file_path):
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if ignore_hidden and os.path.basename(file_path).startswith('.'):
 | 
					        if ignore_hidden and os.path.basename(file_path).startswith('.'):
 | 
				
			||||||
            return False
 | 
					            return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -148,7 +178,8 @@ class DirectoryHandler:
 | 
				
			|||||||
                    kwargs['ignore_file_strings'],
 | 
					                    kwargs['ignore_file_strings'],
 | 
				
			||||||
                    kwargs['ignore_hidden'],
 | 
					                    kwargs['ignore_hidden'],
 | 
				
			||||||
                    kwargs['path_contains'],
 | 
					                    kwargs['path_contains'],
 | 
				
			||||||
                    kwargs['content_contains']
 | 
					                    kwargs['content_contains'],
 | 
				
			||||||
 | 
					                    scan_binary_files=kwargs.get('scan_binary_files', False)
 | 
				
			||||||
                ):
 | 
					                ):
 | 
				
			||||||
                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
 | 
					                    DirectoryHandler.print_file_content(file_path, kwargs['no_comments'], kwargs['compress'])
 | 
				
			||||||
                elif kwargs.get('verbose'):
 | 
					                elif kwargs.get('verbose'):
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										11
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								main.py
									
									
									
									
									
								
							@@ -19,7 +19,8 @@ def main():
 | 
				
			|||||||
                compress=args.compress,
 | 
					                compress=args.compress,
 | 
				
			||||||
                path_contains=args.path_contains,
 | 
					                path_contains=args.path_contains,
 | 
				
			||||||
                content_contains=args.content_contains,
 | 
					                content_contains=args.content_contains,
 | 
				
			||||||
                no_gitignore=args.no_gitignore
 | 
					                no_gitignore=args.no_gitignore,
 | 
				
			||||||
 | 
					                scan_binary_files=args.scan_binary_files
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        elif os.path.isfile(path):
 | 
					        elif os.path.isfile(path):
 | 
				
			||||||
            if DirectoryHandler.should_print_file(
 | 
					            if DirectoryHandler.should_print_file(
 | 
				
			||||||
@@ -28,7 +29,8 @@ def main():
 | 
				
			|||||||
                ignore_file_strings=args.ignore_file_strings,
 | 
					                ignore_file_strings=args.ignore_file_strings,
 | 
				
			||||||
                ignore_hidden=args.ignore_hidden,
 | 
					                ignore_hidden=args.ignore_hidden,
 | 
				
			||||||
                path_contains=args.path_contains,
 | 
					                path_contains=args.path_contains,
 | 
				
			||||||
                content_contains=args.content_contains
 | 
					                content_contains=args.content_contains,
 | 
				
			||||||
 | 
					                scan_binary_files=args.scan_binary_files
 | 
				
			||||||
            ):
 | 
					            ):
 | 
				
			||||||
                DirectoryHandler.handle_file(
 | 
					                DirectoryHandler.handle_file(
 | 
				
			||||||
                    path,
 | 
					                    path,
 | 
				
			||||||
@@ -36,11 +38,12 @@ def main():
 | 
				
			|||||||
                    ignore_file_strings=args.ignore_file_strings,
 | 
					                    ignore_file_strings=args.ignore_file_strings,
 | 
				
			||||||
                    ignore_hidden=args.ignore_hidden,
 | 
					                    ignore_hidden=args.ignore_hidden,
 | 
				
			||||||
                    no_comments=args.no_comments,
 | 
					                    no_comments=args.no_comments,
 | 
				
			||||||
                    compress=args.compress
 | 
					                    compress=args.compress,
 | 
				
			||||||
 | 
					                    scan_binary_files=args.scan_binary_files
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            print(f"Error: {path} is neither a valid file nor a directory.")
 | 
					            print(f"Error: {path} is neither a valid file nor a directory.")
 | 
				
			||||||
            sys.exit(1)
 | 
					            sys.exit(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    main()
 | 
					    main()
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user