#!perl
#
# Repair "mojibake" in the files named on the command line, editing each
# file in place and keeping the original as FILE.bak.
#
# The sequences handled are the ones produced when the UTF-8 bytes of a
# character from the Windows-1252 0x80-0x9F range are mis-read as
# Windows-1252 and then written out again as UTF-8 (e.g. U+2019 RIGHT
# SINGLE QUOTATION MARK turning into U+00E2 U+20AC U+2122).
#
# Usage: perl SCRIPT file ...
#
use strict;
use warnings;
use open qw( :encoding(UTF-8) :std );
use English qw( -no_match_vars );
use File::Glob qw( bsd_glob );

@ARGV or die "Usage: perl $PROGRAM_NAME file ...\n";

# Expand any glob patterns in the arguments (useful when the shell has not
# already done so).
local @ARGV = map { bsd_glob($ARG) } @ARGV;

# Turn on in-place editing for the <> loop below; originals get a .bak suffix.
local $INPLACE_EDIT = '.bak';

# Garbled sequence => intended character, one entry per CP1252 byte 0x80-0x9F.
# Bytes that CP1252 leaves undefined map to the C1 control of the same value.
# See http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
my %mojibake_replace = (
    "\x{00E2}\x{201A}\x{00AC}" => "\x{20AC}",    # 0x80 EURO SIGN
    "\x{00C2}\x{0081}"         => "\x{0081}",    # 0x81 UNDEFINED
    "\x{00E2}\x{20AC}\x{0161}" => "\x{201A}",    # 0x82 SINGLE LOW-9 QUOTATION MARK
    "\x{00C6}\x{2019}"         => "\x{0192}",    # 0x83 LATIN SMALL LETTER F WITH HOOK
    "\x{00E2}\x{20AC}\x{017E}" => "\x{201E}",    # 0x84 DOUBLE LOW-9 QUOTATION MARK
    "\x{00E2}\x{20AC}\x{00A6}" => "\x{2026}",    # 0x85 HORIZONTAL ELLIPSIS
    "\x{00E2}\x{20AC}\x{00A0}" => "\x{2020}",    # 0x86 DAGGER
    "\x{00E2}\x{20AC}\x{00A1}" => "\x{2021}",    # 0x87 DOUBLE DAGGER
    "\x{00CB}\x{2020}"         => "\x{02C6}",    # 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
    "\x{00E2}\x{20AC}\x{00B0}" => "\x{2030}",    # 0x89 PER MILLE SIGN
    "\x{00C5}\x{00A0}"         => "\x{0160}",    # 0x8A LATIN CAPITAL LETTER S WITH CARON
    "\x{00E2}\x{20AC}\x{00B9}" => "\x{2039}",    # 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    "\x{00C5}\x{2019}"         => "\x{0152}",    # 0x8C LATIN CAPITAL LIGATURE OE
    "\x{00C2}\x{008D}"         => "\x{008D}",    # 0x8D UNDEFINED
    "\x{00C5}\x{00BD}"         => "\x{017D}",    # 0x8E LATIN CAPITAL LETTER Z WITH CARON
    "\x{00C2}\x{008F}"         => "\x{008F}",    # 0x8F UNDEFINED
    "\x{00C2}\x{0090}"         => "\x{0090}",    # 0x90 UNDEFINED
    "\x{00E2}\x{20AC}\x{02DC}" => "\x{2018}",    # 0x91 LEFT SINGLE QUOTATION MARK
    "\x{00E2}\x{20AC}\x{2122}" => "\x{2019}",    # 0x92 RIGHT SINGLE QUOTATION MARK
    "\x{00E2}\x{20AC}\x{0153}" => "\x{201C}",    # 0x93 LEFT DOUBLE QUOTATION MARK
    "\x{00E2}\x{20AC}\x{009D}" => "\x{201D}",    # 0x94 RIGHT DOUBLE QUOTATION MARK
    "\x{00E2}\x{20AC}\x{00A2}" => "\x{2022}",    # 0x95 BULLET
    "\x{00E2}\x{20AC}\x{201C}" => "\x{2013}",    # 0x96 EN DASH
    "\x{00E2}\x{20AC}\x{201D}" => "\x{2014}",    # 0x97 EM DASH
    "\x{00CB}\x{0153}"         => "\x{02DC}",    # 0x98 SMALL TILDE
    "\x{00E2}\x{201E}\x{00A2}" => "\x{2122}",    # 0x99 TRADE MARK SIGN
    "\x{00C5}\x{00A1}"         => "\x{0161}",    # 0x9A LATIN SMALL LETTER S WITH CARON
    "\x{00E2}\x{20AC}\x{00BA}" => "\x{203A}",    # 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    "\x{00C5}\x{201C}"         => "\x{0153}",    # 0x9C LATIN SMALL LIGATURE OE
    "\x{00C2}\x{009D}"         => "\x{009D}",    # 0x9D UNDEFINED
    "\x{00C5}\x{00BE}"         => "\x{017E}",    # 0x9E LATIN SMALL LETTER Z WITH CARON
    "\x{00C5}\x{00B8}"         => "\x{0178}",    # 0x9F LATIN CAPITAL LETTER Y WITH DIAERESIS
);

# Matches exactly the keys of %mojibake_replace, grouped by leading character.
# Keep the two in sync: anything matched here is looked up in the table, so a
# sequence missing from the table would be replaced with undef.
my $mojibake_regex = qr{
    (
        \x{00C2}[\x{0081}\x{008D}\x{008F}\x{0090}\x{009D}]
      | \x{00C5}[\x{00A0}\x{00A1}\x{00B8}\x{00BD}\x{00BE}\x{2019}\x{201C}]
      | \x{00C6}\x{2019}
      | \x{00CB}[\x{0153}\x{2020}]
      | \x{00E2}\x{20AC}[\x{009D}\x{00A0}\x{00A1}\x{00A2}\x{00A6}\x{00B0}\x{00B9}\x{00BA}\x{0153}\x{0161}\x{017E}\x{02DC}\x{201C}\x{201D}\x{2122}]
      | \x{00E2}\x{201A}\x{00AC}
      | \x{00E2}\x{201E}\x{00A2}
    )
}x;

# Read every line of every file in @ARGV via the diamond operator. Because
# $INPLACE_EDIT is set, the bare print writes to the rewritten file rather
# than to the terminal.
while ( my $line = <> ) {
    $line =~ s/$mojibake_regex/$mojibake_replace{$1}/g;
    print $line;
}

exit 0;