#!/bin/sh
# Load the publication titles of the DBLP XML dump into the PostgreSQL table
# "dblp" (a single text column, "title"), then print two sanity checks.
#
# Problem was: I want no unresolved entity references such as '&auml;' in the
# database (which is a utf8 database); the real character 'ä' must be stored.
# Solution was: HTML::Entities qw/decode_entities/, followed by an explicit
# encode of the decoded text to UTF-8.

# Stop if the load step fails, so the sanity checks below never report on a
# table left over from an earlier run.
set -e

time zcat ~/dl/dblp.uni-trier.de/xml/dblp.xml.gz \
| perl -MEncode -MHTML::Entities -e '
    use strict;
    use warnings;

    # Boilerplate titles that are not loaded into the table.
    my %is_boilerplate = map { $_ => 1 } (
        "Home Page", "Editorial.", "Preface.", "Introduction.",
        "Foreword.", "Guest Editorial.", "Book Reviews.",
    );

    # Escape sequences of the COPY text format for the characters that COPY
    # would otherwise read as a column or row separator.
    my %copy_escape_for = ( "\t" => "\\t", "\n" => "\\n", "\r" => "\\r" );

    while ( my $line = <STDIN> ) {
        # Capture only the element content: the opening tag (with optional
        # attributes) and the closing tag stay outside the capture group.
        next unless $line =~ m{<title(?:\s[^>]*)?>(.*)</title>};

        my $title = $1;
        $title = decode_entities($title);

        # Backslash is the escape character of the COPY text format, so it
        # is dropped. This runs after decoding so that a backslash written
        # as a numeric entity cannot slip through.
        $title =~ tr/\\//d;

        next if $is_boilerplate{$title};

        # A tab or line break produced by decoding must not split the row.
        $title =~ s{([\t\n\r])}{$copy_escape_for{$1}}g;

        # Strict "UTF-8" (not the lax "UTF8") so that FB_CROAK really aborts
        # on code points that are not valid for interchange.
        print encode( "UTF-8", $title . "\n", Encode::FB_CROAK );
    }
' \
| psql -c "
    drop table if exists dblp;
    create table dblp (title text);
    copy dblp from stdin;
"

# Sanity check 1: how many titles were loaded.
echo "select count(*) from dblp " | psql

# Sanity check 2: sample of titles that still contain an ampersand. Any row
# that looks like an entity reference here means decoding missed something.
echo "select * from dblp where position ('&' in title) > 0 limit 40" | psql