Normalize file names

Abstract

Normalize file names on Ubuntu 12.04.

The following script and command:

  • Normalize unicode characters. E.g. image①.jpg -> image1.jpg.
  • Make the extension of a basename lower case. E.g. image.JPG -> image.jpg
  • Strip leading and trailing whitespace from the root (the part before the extension) of a basename. E.g. " image .JPG" -> "image.jpg"
  • Replace runs of multiple spaces with a single space. E.g. "image   file  001.jpg" -> "image file 001.jpg"

Code

normalize_name.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Shinya
# The MIT License (MIT) http://opensource.org/licenses/mit-license.php

import os
import re
import sys
import unicodedata


def _unicode_to_str(text):
    if isinstance(text, unicode):
        return text.encode('utf-8')
    return text


def _str_to_unicode(text):
    if isinstance(text, str):
        return text.decode('utf-8')
    return text


def _normalize_multiple_spaces(name):
    """Replaces multiple spaces with a single space.

    >>> _normalize_multiple_spaces('\t name    having \t multiple spaces')
    ' name having multiple spaces'
    """
    return re.sub('\s+', ' ', name)


def _remove_special_characters(name):
    """Removes special characters like a slash.

    >>> _remove_special_characters('name having a slash / character')
    'name having a slash  character'
    """
    return name.replace(os.path.sep, '')


def _normalize_basename(basename):
    """Returns the normalized form of a file's basename.

    Runs of whitespace collapse to a single space.
    >>> _normalize_basename('name  having\t multiple spaces')
    'name having multiple spaces'

    Leading and trailing spaces are stripped.
    >>> _normalize_basename('  name starting and ending with spaces   ')
    'name starting and ending with spaces'

    The extension is lower-cased.
    >>> _normalize_basename('name.JPG')
    'name.jpg'

    Spaces around the root (the part before the extension) are stripped.
    >>> _normalize_basename('  name  .jpg')
    'name.jpg'

    Unicode compatibility characters are normalized (NFKC). E.g. ① -> 1
    >>> _normalize_basename('name①.jpg')
    'name1.jpg'
    """
    text = unicodedata.normalize('NFKC', _str_to_unicode(basename))
    # Separators are removed after NFKC because compatibility characters such
    # as the fullwidth solidus (U+FF0F) normalize to '/'.
    text = _remove_special_characters(text)
    text = _normalize_multiple_spaces(os.path.normcase(text))
    stem, suffix = os.path.splitext(text)
    cleaned = stem.strip() + suffix.strip().lower()
    return _unicode_to_str(cleaned)


def _main():
    """Renames the file given on the command line to its normalized name.

    Expects exactly one argument, the path of the target file; otherwise the
    script exits with a usage message.  Only the basename changes, so the
    file stays in its directory.  Nothing happens when the name is already
    normalized.

    The rename is refused, and the script exits with an error message, when
    the normalized name is empty or when a different file already has that
    name.  On POSIX os.rename would silently replace that file.
    """
    if len(sys.argv) != 2:
        sys.exit('Usage: $ ./normalize_name.py <target_file>')
    src_path = sys.argv[1]
    (src_dir, src_basename) = os.path.split(os.path.normpath(src_path))
    norm_basename = _normalize_basename(src_basename)
    if src_basename == norm_basename:
        return
    if not norm_basename:
        # E.g. a name made only of whitespace; renaming to '' would target
        # the directory itself.
        sys.exit('"%s": normalized name is empty; not renamed.'
                 % src_basename)
    src = os.path.join(src_dir, src_basename)
    dst = os.path.join(src_dir, norm_basename)
    if os.path.lexists(dst):
        # On a case-insensitive file system "A.JPG" and "a.jpg" are the same
        # file, and renaming one to the other is still safe.
        try:
            same_file = os.path.samefile(src, dst)
        except OSError:
            same_file = False
        if not same_file:
            sys.exit('"%s" -> "%s": target already exists; not renamed.'
                     % (src_basename, norm_basename))
    sys.stdout.write('"%s" -> "%s"\n' % (src_basename, norm_basename))
    os.rename(src, dst)


if __name__ == '__main__':
    # Script entry point: rename the file passed as the only argument.
    # The doctests are not run from here; run them with:
    # $ python -m doctest -v normalize_name.py
    _main()
Set permission to run
$ chmod +x normalize_name.py
Tests
$ python -m doctest -v normalize_name.py

Commands

# Set a target directory.
$ TARGET_DIR='/home/shinya/example'
$ find "${TARGET_DIR}" -type f -exec ./normalize_name.py {} \;

Example

$ TARGET_DIR='/home/shinya/example'


# Before running the command.
$ tree "${TARGET_DIR}"
/home/shinya/example
├── image①.jpg
└── subdir
    └──    image  file  001 .jpg


$ find "${TARGET_DIR}" -type f -exec ./normalize_name.py {} \;
"   image  file  001 .jpg" -> "image file 001.jpg"
"image①.jpg" -> "image1.jpg"


# After running the command.
$ tree "${TARGET_DIR}"
/home/shinya/example
├── image1.jpg
└── subdir
    └── image file 001.jpg