#!perl -w # Written 2007 Chad Redman # Free for any use package MySAXHandler; # use Encode; # use utf8; use base qw(XML::SAX::Base); sub new { my $type = shift; return bless { 'content' => { 'texts' => [], 'files' => [], 'words' => [], 'full_sentences' => [], }, }, $type; } my $current_element = ''; my $current_textid = ''; my $current_fileid = ''; my $ct_para = 0; my $sentence_full = undef; my $current_sentence_id = ''; my $ct_word = 0; my $buf_word = undef; my $current_speech = undef; sub start_element { my ($self, $el) = @_; $current_element = $el->{Name}; if ($el->{Name} eq 'text') { $current_textid = $el->{Attributes}->{'{}ID'}{Value}; push @{$self->{content}{texts}}, [ $el->{Attributes}->{'{}ID'}{Value}, $el->{Attributes}->{'{}TYPE'}{Value}]; } elsif ($el->{Name} eq 'file') { $current_fileid = $el->{Attributes}->{'{}ID'}{Value}; push @{$self->{content}{files}}, [$el->{Attributes}->{'{}ID'}{Value}, $current_textid]; } elsif ($el->{Name} eq 'p') { ++$ct_para; } elsif ($el->{Name} eq 's') { $ct_word = 0; $sentence_full = ''; $current_sentence_id = $el->{Attributes}->{'{}n'}{Value}; } elsif ($el->{Name} eq 'w') { $buf_word = ''; $current_speech = $el->{Attributes}->{'{}POS'}{Value}; ++$ct_word; } elsif ($el->{Name} eq 'c') { $buf_word = ''; $current_speech = $el->{Attributes}->{'{}POS'}{Value}; ++$ct_word; } } sub end_element { my ($self, $el) = @_; if ($el->{Name} eq 'text') { $current_textid = undef; } if ($el->{Name} eq 'file') { $current_fileid = undef; $ct_para = 0; } elsif ($el->{Name} eq 'p') { } elsif ($el->{Name} eq 's') { push @{$self->{content}{full_sentences}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_para, $sentence_full]; $sentence_full = undef; $current_sentence_id = ''; $ct_word = 0; } elsif ($el->{Name} eq 'w') { #print 'utf8: ', utf8::is_utf8( encode("utf8", $string)); exit 0; #print 'utf8: ', utf8::is_utf8($buf_word); exit 0; #$buf_word =~ s/'/''/g; # this shouldn't be necessary, but there are some places (section F) where it slips through push @{$self->{content}{words}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_word, $buf_word, $current_speech, $el->{Name}, $ct_para]; $sentence_full .= $buf_word; $buf_word = undef; $current_speech = undef; } elsif ($el->{Name} eq 'c') { #$buf_word =~ s/'/''/g; push @{$self->{content}{words}}, [$current_textid, $current_fileid, $current_sentence_id, $ct_word, $buf_word, $current_speech, $el->{Name}, $ct_para]; $sentence_full .= $buf_word; $buf_word = undef; $current_speech = undef; } } sub characters { my ($self, $chars) = @_; if ($current_element =~ /^(c|w)$/) { $buf_word .= $chars->{Data}; } } 1; package main; use XML::SAX; use DBI; #binmode STDOUT, ":utf8"; my $dir = 'G:/Home/Chinese/corpora/LCMC/character'; my $dbh = DBI->connect("dbi:SQLite:dbname=G:/Home/Chinese/devel/corpus/lcmc","","") or die $!; $dbh->{unicode} = 1; $dbh->{AutoCommit} = 0; foreach my $xmlfile (<$dir/LCMC_?.XML>) { my $handler = MySAXHandler->new; my $parser = XML::SAX::ParserFactory->parser(Handler => $handler ); $parser->parse_uri($xmlfile); #$parser->parse_uri('G:/Home/Chinese/corpora/LCMC/character/LCMC_R.XML'); my $content = $handler->{content}; eval { write_object($dbh, $content, 'texts', 'files', 'words', 'full_sentences'); $dbh->commit; # commit the changes if we get this far }; if ($@) { print "Transaction aborted because $@"; eval { $dbh->rollback }; } print "Finished $xmlfile\n"; } exit 0; sub write_object { my ($dbh, $content, @targets) = @_; my %sql = ( texts => "?, ?", files => "?, ?", words => "?, ?, ?, ?, ?, ?, ?, ?", full_sentences => "?, ?, ?, ?, ?", ); foreach my $target (@targets) { my $sth = $dbh->prepare("INSERT INTO $target VALUES ($sql{$target})"); foreach my $obj (@{$content->{$target}}) { my $rows_affected = $sth->execute(@$obj); if ($rows_affected != 1) { print "*Error: $target: $rows_affected, ", $dbh->errstr, "\n"; } } } } sub dump_object { my ($content, $target) = @_; open(OUT, "$target.txt") or die $!; #open(OUT, ">:utf8", "$target.txt") or die $!; foreach my $obj (@{$content->{$target}}) { #use Data::Dumper; print Dumper $obj; exit 0; print OUT join("\t", @$obj), "\n"; } close OUT; } __END__ CREATE DATABASE lcmc_corpus COLLATE SQL_Latin1_General_CP1_CI_AS GO USE lcmc_corpus GO CREATE TABLE texts ( id VARCHAR(2) NOT NULL, type VARCHAR(50), CONSTRAINT pk_texts PRIMARY KEY (id) ) GO CREATE TABLE files ( id VARCHAR(3) NOT NULL, text_id VARCHAR(2) NOT NULL, CONSTRAINT pk_files PRIMARY KEY (id), CONSTRAINT fk_texts01 FOREIGN KEY (text_id) REFERENCES texts(id) ) GO CREATE TABLE words ( text_id VARCHAR(2) NOT NULL, file_id VARCHAR(3) NOT NULL, sentence_id VARCHAR(5) NOT NULL, word_num INTEGER, characters NVARCHAR(16), part_of_speech VARCHAR(3), token_type CHAR(1), paragraph_num INTEGER, CONSTRAINT pk_words PRIMARY KEY (text_id, file_id, sentence_id, word_num), CONSTRAINT fk_texts02 FOREIGN KEY (text_id) REFERENCES texts(id), CONSTRAINT fk_filess01 FOREIGN KEY (file_id) REFERENCES files(id) ) GO CREATE TABLE full_sentences ( text_id VARCHAR(2) NOT NULL, file_id VARCHAR(3) NOT NULL, sentence_id VARCHAR(5) NOT NULL, paragraph_num INTEGER, characters NVARCHAR(500), CONSTRAINT pk_full_sentences PRIMARY KEY (text_id, file_id, sentence_id), CONSTRAINT fk_texts03 FOREIGN KEY (text_id) REFERENCES texts(id), CONSTRAINT fk_filess02 FOREIGN KEY (file_id) REFERENCES files(id) ) GO /***********/ SELECT TOP 100 A.*, B.characters, C.characters, B.part_of_speech, C.part_of_speech FROM words A join words B on A.file_id = B.file_id and A.sentence_id = B.sentence_id and A.word_num + 1 = B.word_num join words C on A.file_id = C.file_id and A.sentence_id = C.sentence_id and A.word_num + 2 = C.word_num WHERE A.part_of_speech = 'p' /* sample - how to get random words */ select TOP 100 W.file_id, T.type, W.characters, S.characters from words W , full_sentences S, texts T where W.file_id = S.file_id and W.sentence_id = S.sentence_id AND W.text_id = T.id and W.part_of_speech = 'p' and len(S.characters) between 10 and 24 and W.characters NOT IN (N'?') order by NEWID() /* find sentences containing a specific token */ select W.file_id, T.type, W.characters, W.part_of_speech, S.characters from words W join full_sentences S on W.file_id = S.file_id and W.sentence_id = S.sentence_id join texts T on S.text_id = T.id where W.characters like N'??' and len(S.characters) < 25