Hi all. I have some Twitter data in Cyrillic stored in a database. They show up as Cyrillic chars in pgAdmin. I am trying to extract some records and save them to a .json file, like so:
#!/usr/bin/perl -w
use JSON::XS;
use Lingua::Identify qw(:language_identification);
#use lib qw(/home/corman/perlmodules);
#use SqlSupport;
my $dbh = connectpgdb('****','****','****','Pg','localhost');
my @items = getsqlcol($dbh,"select tweet_text from twitter order by ra
+ndom() limit 10000");
my $sample;
my $idx = 0;
foreach my $tweet (@items) {
my $lang = langof($tweet);
if ($lang =~ /ru|bg|uk/) {
$sample->[$idx]->{text} = $tweet;
$sample->[$idx]->{lang} = $lang;
$sample->[$idx]->{len} =length($tweet);
$idx++;
}
}
print "$idx items\n";
open(OUT,">twitter-non-en.json") or die "Can't open output: $!";
binmode OUT, ':utf8';
print OUT encode_json($sample);
close OUT;
sub connectpgdb { # this is used to connect with DBD::Pg
my ($database,$user,$password,$driver,$server) = @_;
my $url = "DBI:$driver:dbname=$database;host=$server;port=5432";
my $dbh = DBI->connect( $url, $user, $password,{AutoCommit=>1,Rais
+eError=>1,PrintError=>0}) or die "connectdb can't connect to psql: $!
+\n";
return $dbh;
}
sub getsqlcol {
my ($dbh,$sqlstatement)= @_;
my @results = ();
my $sth = $dbh->prepare($sqlstatement);
my @col;
$sth->execute || die "Could not execute MySQL statement: $sqlstate
+ment";
while (@col=$sth->fetchrow_array) { push(@results,$col[0]); }
return @results;
}
When I open the resulting .json in Firefox to inspect, the text fields are not Cyrillic but look like this:
УдаÑ\u…\u0081номÑ\u0083. I have this problem not just with JSON::XS but when saving to .txt files, etc. What am I doing wrong?