#!perl -w

# Written 2007 Chad Redman
# Free for any use


package MySAXHandler;

#  use Encode;
#  use utf8;
  
  
use base qw(XML::SAX::Base);

sub new {
    my $type = shift;
    return bless {
           'content' => {
                   'texts' => [],
                   'files' => [],
                   'words' => [],
                   'full_sentences' => [],
           },
    }, $type;
}

my $current_element = '';

my $current_textid = '';
my $current_fileid = '';
my $ct_para = 0;
my $sentence_full = undef;
my $current_sentence_id = '';
my $ct_word = 0;
my $buf_word = undef;
my $current_speech = undef;

sub start_element {
    my ($self, $el) = @_;
    $current_element = $el->{Name};
    if ($el->{Name} eq 'text') {
        $current_textid = $el->{Attributes}->{'{}ID'}{Value};
        push @{$self->{content}{texts}}, [ $el->{Attributes}->{'{}ID'}{Value}, $el->{Attributes}->{'{}TYPE'}{Value}];
    }
    elsif ($el->{Name} eq 'file') {
        $current_fileid = $el->{Attributes}->{'{}ID'}{Value};
        push @{$self->{content}{files}}, [$el->{Attributes}->{'{}ID'}{Value}, $current_textid];
    }
    elsif ($el->{Name} eq 'p') {
        ++$ct_para;
    }
    elsif ($el->{Name} eq 's') {
        $ct_word = 0;
        $sentence_full = '';
        $current_sentence_id = $el->{Attributes}->{'{}n'}{Value};
    }
    elsif ($el->{Name} eq 'w') {
        $buf_word = '';
        $current_speech = $el->{Attributes}->{'{}POS'}{Value};
        ++$ct_word;
    }
    elsif ($el->{Name} eq 'c') {
        $buf_word = '';
        $current_speech = $el->{Attributes}->{'{}POS'}{Value};
        ++$ct_word;
    }
}

sub end_element {
    my ($self, $el) = @_;

    if ($el->{Name} eq 'text') {
        $current_textid = undef;
    }
    if ($el->{Name} eq 'file') {
        $current_fileid = undef;
        $ct_para = 0;
    }
    elsif ($el->{Name} eq 'p') {
    }
    elsif ($el->{Name} eq 's') {
        push @{$self->{content}{full_sentences}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_para, $sentence_full];
        $sentence_full = undef;
        $current_sentence_id = '';
        $ct_word = 0;
    }
    elsif ($el->{Name} eq 'w') {
        #print 'utf8: ', utf8::is_utf8( encode("utf8", $string)); exit 0;
        #print 'utf8: ', utf8::is_utf8($buf_word); exit 0;
        #$buf_word =~ s/'/''/g;   # this shouldn't be necessary, but there are some places (section F) where it slips through
        push @{$self->{content}{words}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_word, $buf_word, $current_speech, $el->{Name}, $ct_para];
        $sentence_full .= $buf_word;
        $buf_word = undef;
        $current_speech = undef;
    }
    elsif ($el->{Name} eq 'c') {
        #$buf_word =~ s/'/''/g;
        push @{$self->{content}{words}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_word, $buf_word, $current_speech, $el->{Name}, $ct_para];
        $sentence_full .= $buf_word;
        $buf_word = undef;
        $current_speech = undef;
    }
}

sub characters {
    my ($self, $chars) = @_;

    if ($current_element =~ /^(c|w)$/) {
        $buf_word .= $chars->{Data};
    }
}

1;

package main;

use XML::SAX;
use DBI;

#binmode STDOUT, ":utf8";

my $dir = 'G:/Home/Chinese/corpora/LCMC/character';

my $dbh = DBI->connect("dbi:SQLite:dbname=G:/Home/Chinese/devel/corpus/lcmc","","") or die $!;
$dbh->{unicode} = 1;
$dbh->{AutoCommit} = 0;

foreach my $xmlfile (<$dir/LCMC_?.XML>) {
    my $handler = MySAXHandler->new;
    my $parser = XML::SAX::ParserFactory->parser(Handler => $handler );

    $parser->parse_uri($xmlfile);
    #$parser->parse_uri('G:/Home/Chinese/corpora/LCMC/character/LCMC_R.XML');

    my $content = $handler->{content};

    eval {
        write_object($dbh, $content, 'texts', 'files', 'words', 'full_sentences');
        $dbh->commit;   # commit the changes if we get this far
    };
    if ($@) {
        print  "Transaction aborted because $@";
        eval { $dbh->rollback };
    }
    
    print "Finished $xmlfile\n";

}


exit 0;


sub write_object {
    my ($dbh, $content, @targets) = @_;

    my %sql = (
        texts => "?, ?",
        files => "?, ?",
        words => "?, ?, ?, ?, ?, ?, ?, ?",
        full_sentences => "?, ?, ?, ?, ?",
    );
        
    foreach my $target (@targets) {
        my $sth = $dbh->prepare("INSERT INTO $target VALUES ($sql{$target})");
        foreach my $obj (@{$content->{$target}}) {
            my $rows_affected = $sth->execute(@$obj);
            if ($rows_affected != 1) {
                print "*Error: $target: $rows_affected, ", $dbh->errstr, "\n";
            }
        }
    }
}


sub dump_object {
    my ($content, $target) = @_;
    open(OUT, "$target.txt") or die $!;
    #open(OUT, ">:utf8", "$target.txt") or die $!;

    foreach my $obj (@{$content->{$target}}) {
        #use Data::Dumper; print Dumper $obj; exit 0;
        print OUT join("\t", @$obj), "\n";
    }
    
    close OUT;
}


__END__

CREATE DATABASE lcmc_corpus
COLLATE SQL_Latin1_General_CP1_CI_AS
GO


USE lcmc_corpus
GO

CREATE TABLE texts (
    id   VARCHAR(2) NOT NULL,
    type VARCHAR(50),
    CONSTRAINT pk_texts PRIMARY KEY (id)
)
GO

CREATE TABLE files (
    id       VARCHAR(3) NOT NULL,
    text_id  VARCHAR(2) NOT NULL,
    CONSTRAINT pk_files PRIMARY KEY (id),
    CONSTRAINT fk_texts01 FOREIGN KEY (text_id) REFERENCES texts(id)
)
GO

CREATE TABLE words (
    text_id  VARCHAR(2) NOT NULL,
    file_id  VARCHAR(3) NOT NULL,
    sentence_id  VARCHAR(5) NOT NULL,
    word_num     INTEGER,
    characters  NVARCHAR(16),
    part_of_speech  VARCHAR(3),
    token_type      CHAR(1),
    paragraph_num  INTEGER,
    CONSTRAINT pk_words PRIMARY KEY (text_id, file_id, sentence_id, word_num),
    CONSTRAINT fk_texts02 FOREIGN KEY (text_id) REFERENCES texts(id),
    CONSTRAINT fk_filess01 FOREIGN KEY (file_id) REFERENCES files(id)
)
GO

CREATE TABLE full_sentences (
    text_id  VARCHAR(2) NOT NULL,
    file_id  VARCHAR(3) NOT NULL,
    sentence_id  VARCHAR(5) NOT NULL,
    paragraph_num  INTEGER,
    characters  NVARCHAR(500),
    CONSTRAINT pk_full_sentences PRIMARY KEY (text_id, file_id, sentence_id),
    CONSTRAINT fk_texts03 FOREIGN KEY (text_id) REFERENCES texts(id),
    CONSTRAINT fk_filess02 FOREIGN KEY (file_id) REFERENCES files(id)
)
GO


/***********/

SELECT TOP 100 A.*, B.characters, C.characters, B.part_of_speech, C.part_of_speech
FROM words A
join words B on A.file_id = B.file_id and A.sentence_id = B.sentence_id and A.word_num + 1 = B.word_num
join words C on A.file_id = C.file_id and A.sentence_id = C.sentence_id and A.word_num + 2 = C.word_num
WHERE A.part_of_speech = 'p'

/* sample - how to get random words */
select TOP 100 W.file_id, T.type, W.characters, S.characters from words W , full_sentences S, texts T
where W.file_id = S.file_id and W.sentence_id = S.sentence_id
AND W.text_id = T.id
and W.part_of_speech = 'p' and len(S.characters) between 10 and 24
and W.characters NOT IN (N'?')
order by NEWID()


/* find sentences containing a specific token */
select W.file_id, T.type, W.characters, W.part_of_speech, S.characters
from words W
join full_sentences S on W.file_id = S.file_id and W.sentence_id = S.sentence_id
join texts T on S.text_id = T.id
where W.characters like N'??' and len(S.characters) < 25