Normalize file names
Abstract
Normalize file names on Ubuntu 12.04.
The following script and command:
- Normalize unicode characters. E.g. image①.jpg -> image1.jpg.
- Make the extension of a basename lower case. E.g. image.JPG -> image.jpg
- Strip leading and trailing whitespace from the root of a basename. E.g. " image .JPG" -> "image.jpg"
- Replace multiple spaces with a single space. E.g. "image   file  001.jpg" -> "image file 001.jpg"
Code
normalize_name.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Shinya
# The MIT License (MIT) http://opensource.org/licenses/mit-license.php
"""Normalize the basename of the single file given on the command line.

The file is renamed in place (inside its own directory) when its normalized
name differs from its current name. Runs on both Python 2 and Python 3.
"""

import os
import re
import sys
import unicodedata

# On Python 2 the native ``str`` type is a byte string, so text has to be
# decoded before Unicode normalization and encoded again afterwards.
_PY2 = sys.version_info[0] == 2

# The Unicode text type: ``unicode`` on Python 2, ``str`` on Python 3.
_UNICODE_TYPE = type(u'')

# Raw string: '\s' in a plain literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r'\s+')

# Names that cannot be used as the target of a rename.
_UNUSABLE_BASENAMES = ('', '.', '..')


def _unicode_to_str(text):
    """Converts Unicode text to the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` object is encoded as UTF-8. On Python 3, and
    for any other kind of value, the argument is returned unchanged.
    """
    if _PY2 and isinstance(text, _UNICODE_TYPE):
        return text.encode('utf-8')
    return text


def _str_to_unicode(text):
    """Converts a UTF-8 byte string to Unicode text.

    ``bytes`` is the native ``str`` on Python 2, so this decodes ordinary
    Python 2 strings as well as Python 3 ``bytes``. Any other value
    (including text that is already Unicode) is returned unchanged.

    Raises UnicodeDecodeError if the bytes are not valid UTF-8.
    """
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text


def _normalize_multiple_spaces(name):
    r"""Replaces each run of whitespace with a single space.

    >>> _normalize_multiple_spaces('\t name   having \t  multiple spaces')
    ' name having multiple spaces'
    """
    return _WHITESPACE_RUN.sub(' ', name)


def _remove_special_characters(name):
    """Removes the path separator (a slash on POSIX) from a name.

    Only the separator itself is removed; the spaces around it are kept.

    >>> _remove_special_characters('name having a slash / character')
    'name having a slash  character'
    """
    return name.replace(os.path.sep, '')


def _normalize_basename(basename):
    r"""Normalizes a basename and returns it as a native ``str``.

    Replaces multiple spaces with a single space.

    >>> _normalize_basename('name having\t  multiple   spaces')
    'name having multiple spaces'

    Strips leading and trailing spaces.

    >>> _normalize_basename('  name starting and ending with spaces  ')
    'name starting and ending with spaces'

    Makes the extension of a basename lowercase.

    >>> _normalize_basename('name.JPG')
    'name.jpg'

    Strips the spaces around the root of a basename.

    >>> _normalize_basename(' name .jpg')
    'name.jpg'

    Normalizes unicode characters. E.g. ① -> 1

    >>> _normalize_basename('name①.jpg')
    'name1.jpg'
    """
    norm_basename = _str_to_unicode(basename)
    norm_basename = unicodedata.normalize('NFKC', norm_basename)
    # Must run after NFKC: compatibility characters such as the fullwidth
    # solidus normalize to a real path separator.
    norm_basename = _remove_special_characters(norm_basename)
    norm_basename = os.path.normcase(norm_basename)
    norm_basename = _normalize_multiple_spaces(norm_basename)
    (root, ext) = os.path.splitext(norm_basename)
    return _unicode_to_str(root.strip() + ext.strip().lower())


def _is_another_existing_file(src_path, dst_path):
    """Returns True if dst_path exists and is not the same file as src_path."""
    if not os.path.lexists(dst_path):
        return False
    try:
        # On a case-insensitive file system a case-only rename finds the
        # source file itself at the destination; that is not a collision.
        return not os.path.samefile(src_path, dst_path)
    except OSError:
        # E.g. a dangling symlink at dst_path: treat it as occupied.
        return True


def _main():
    """Renames the file named by ``sys.argv[1]`` to its normalized basename.

    Does nothing when the name is already normalized. Exits with an error
    message (non-zero status) when the arguments are wrong, when the
    normalized name is unusable, or when another file already has the
    normalized name.
    """
    if len(sys.argv) != 2:
        sys.exit('Usage: $ ./normalize_name.py <target_file>')
    src_path = sys.argv[1]
    (src_dir, src_basename) = os.path.split(os.path.normpath(src_path))
    norm_basename = _normalize_basename(src_basename)
    if src_basename == norm_basename:
        return
    if norm_basename in _UNUSABLE_BASENAMES:
        sys.exit('Skipped "%s": the normalized name "%s" is not usable.'
                 % (src_basename, norm_basename))
    rename_from = os.path.join(src_dir, src_basename)
    rename_to = os.path.join(src_dir, norm_basename)
    # os.rename overwrites a file if the file exists, so refuse to rename
    # when two different names normalize to the same one. (Best effort: a
    # file created between this check and the rename is still overwritten.)
    if _is_another_existing_file(rename_from, rename_to):
        sys.exit('Skipped "%s": "%s" already exists.'
                 % (src_basename, norm_basename))
    sys.stdout.write('"%s" -> "%s"\n' % (src_basename, norm_basename))
    os.rename(rename_from, rename_to)


if __name__ == '__main__':
    # Run the following command to run doctests.
    # $ python -m doctest -v normalize_name.py
    _main()
Set permission to run
$ chmod +x normalize_name.py
Tests
$ python -m doctest -v normalize_name.py
Commands
# Set a target directory. $ TARGET_DIR='/home/shinya/example' $ find "${TARGET_DIR}" -type f -exec ./normalize_name.py {} \;
Example
$ TARGET_DIR='/home/shinya/example' # Before running the command. $ tree "${TARGET_DIR}" /home/shinya/example ├── image①.jpg └── subdir └── image file 001 .jpg $ find "${TARGET_DIR}" -type f -exec ./normalize_name.py {} \; " image file 001 .jpg" -> "image file 001.jpg" "image①.jpg" -> "image1.jpg" # After running the command. $ tree "${TARGET_DIR}" /home/shinya/example ├── image1.jpg └── subdir └── image file 001.jpg