"SfR Fresh" - the SfR Freeware/Shareware Archive 
Member "odt2txt-0.4/odt2txt.c" of archive odt2txt-0.4.tar.gz:
As a special service "SfR Fresh" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting with prefixed line numbers.
Alternatively, you can view or download the uninterpreted source code file here.
The same can be achieved for any archive member file by clicking, within an archive contents listing, on the first character of the file (path) or on the corresponding byte-size field.
1 /*
2 * odt2txt.c: A simple (and stupid) converter from OpenDocument Text
3 * to plain text.
4 *
5 * Copyright (c) 2006-2008 Dennis Stosberg <dennis@stosberg.net>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License,
9 * version 2 as published by the Free Software Foundation
10 */
11
12 #include <sys/stat.h>
13 #include <sys/types.h>
14
15 #include <errno.h>
16 #include <fcntl.h>
17 #ifdef NO_ICONV
18 # define iconv_t int
19 #else
20 # include <iconv.h>
21 # ifdef WIN32
22 # include <windows.h>
23 # else
24 # include <langinfo.h>
25 # endif
26 #endif
27
28 #include <limits.h>
29 #include <locale.h>
30 #include <stddef.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <unistd.h>
35
36 #include "mem.h"
37 #include "regex.h"
38 #include "strbuf.h"
39 #include "kunzip/kunzip.h"
40
/* Version string printed by usage() and version_info(). */
#define VERSION "0.4"

/*
 * Values of opt_subst: how many of the characters listed in the
 * substitution table are replaced by their ASCII stand-ins.
 */
enum {
	SUBST_NONE = 0,
	SUBST_SOME = 1,
	SUBST_ALL  = 2
};

/* Settings, filled in by main() from the command line. */
static const char *opt_filename;
static char *opt_output;
static char *opt_encoding;
static int opt_raw;
static int opt_width = 63;
static int opt_subst = SUBST_SOME;
54
55 #ifndef ICONV_CHAR
56 #define ICONV_CHAR char
57 #endif
58
59 #ifdef iconvlist
60 static void show_iconvlist();
61 #endif
62
63 #define RS_O(a,b) (void)regex_subst(buf, (a), _REG_DEFAULT, (b))
64 #define RS_G(a,b) (void)regex_subst(buf, (a), _REG_GLOBAL, (b))
65 #define RS_E(a,b) (void)regex_subst(buf, (a), _REG_EXEC | _REG_GLOBAL, (void*)(b))
66
67 static char *guess_encoding(void);
68 static void write_to_file(STRBUF *outbuf, const char *filename);
69
70 struct subst {
71 int unicode;
72 const char *utf8;
73 const char *ascii;
74 };
75
76 static struct subst substs[] = {
77 /* number, UTF-8 sequence, ascii substitution */
78 { 0x00A0, "\xC2\xA0", " " }, /* no-break space */
79 { 0x00A9, "\xC2\xA9", "(c)" }, /* copyright sign */
80 { 0x00AB, "\xC2\xAB", "<<" }, /* left double angle quote */
81 { 0x00AD, "\xC2\xAD", "-" }, /* soft hyphen */
82 { 0x00AE, "\xC2\xAE", "(r)" }, /* registered sign */
83 { 0x00BB, "\xC2\xBB", ">>" }, /* right double angle quote */
84
85 { 0x00BC, "\xC2\xBC", "1/4" }, /* one quarter */
86 { 0x00BD, "\xC2\xBD", "1/2" }, /* one half */
87 { 0x00BE, "\xC2\xBE", "3/4" }, /* three quarters */
88
89 { 0x00C4, "\xC3\x84", "Ae" }, /* german umlaut A */
90 { 0x00D6, "\xC3\x96", "Oe" }, /* german umlaut O */
91 { 0x00DC, "\xC3\x9C", "Ue" }, /* german umlaut U */
92 { 0x00DF, "\xC3\x9F", "ss" }, /* german sharp s */
93 { 0x00E4, "\xC3\xA4", "ae" }, /* german umlaut a */
94 { 0x00F6, "\xC3\xB6", "oe" }, /* german umlaut o */
95 { 0x00FC, "\xC3\xBC", "ue" }, /* german umlaut u */
96
97 { 0x2010, "\xE2\x80\x90", "-" }, /* hyphen */
98 { 0x2011, "\xE2\x80\x91", "-" }, /* non-breaking hyphen */
99 { 0x2012, "\xE2\x80\x92", "-" }, /* figure dash */
100 { 0x2013, "\xE2\x80\x93", "-" }, /* en dash */
101 { 0x2014, "\xE2\x80\x94", "--" }, /* em dash */
102 { 0x2015, "\xE2\x80\x95", "--" }, /* quotation dash */
103
104 { 0x2018, "\xE2\x80\x98", "`" }, /* single left quotation mark */
105 { 0x2019, "\xE2\x80\x99", "'" }, /* single right quotation mark */
106 { 0x201A, "\xE2\x80\x9A", "," }, /* german single right quotation mark */
107 { 0x201B, "\xE2\x80\x9B", "`" }, /* reversed right quotation mark */
108 { 0x201C, "\xE2\x80\x9C", "``" }, /* left quotation mark */
109 { 0x201D, "\xE2\x80\x9D", "''" }, /* right quotation mark */
110 { 0x201E, "\xE2\x80\x9E", ",," }, /* german left quotes */
111
112 { 0x2022, "\xE2\x80\xA2", "o " }, /* bullet */
113 { 0x2022, "\xE2\x80\xA3", "< " }, /* triangle bullet */
114
115 { 0x2025, "\xE2\x80\xA5", ".." }, /* double dot */
116 { 0x2026, "\xE2\x80\xA6", "..." }, /* ellipsis */
117
118 { 0x2030, "\xE2\x80\xB0", "o/oo" }, /* per mille */
119 { 0x2039, "\xE2\x80\xB9", "<" }, /* left single angle quote */
120 { 0x203A, "\xE2\x80\xBA", ">" }, /* right single angle quote */
121
122 { 0x20AC, "\xE2\x82\xAC", "EUR" }, /* euro currency symbol */
123
124 { 0x2190, "\xE2\x86\x90", "<-" }, /* left arrow */
125 { 0x2192, "\xE2\x86\x92", "->" }, /* right arrow */
126 { 0x2194, "\xE2\x86\x94", "<->"}, /* left right arrow */
127
128 { 0, NULL, NULL },
129 };
130
131 static void usage(void)
132 {
133 printf("odt2txt %s\n"
134 "Converts an OpenDocument or OpenOffice.org XML File to raw text.\n\n"
135 "Syntax: odt2txt [options] filename\n\n"
136 "Options: --raw Print raw XML\n"
137 #ifdef NO_ICONV
138 " --encoding=X Ignored. odt2txt has been built without iconv support.\n"
139 " Output will always be encoded in UTF-8\n"
140 #else
141 " --encoding=X Do not try to autodetect the terminal encoding, but\n"
142 " convert the document to encoding X unconditionally\n"
143 # ifdef iconvlist
144 " You can list all supported encodings by specifying\n"
145 " --encoding=list\n"
146 # endif
147 " To find out, which terminal encoding will be used in\n"
148 " auto mode, use --encoding=show\n"
149 #endif
150 " --width=X Wrap text lines after X characters. Default: 65.\n"
151 " If set to -1 then no lines will be broken\n"
152 " --output=file Write output to file, instead of STDOUT\n"
153 " --subst=X Select which non-ascii characters shall be replaced\n"
154 " by ascii look-a-likes:\n"
155 " --subst=all Substitute all characters for which\n"
156 " substitutions are known\n"
157 " --subst=some Substitute all characters which the\n"
158 " output charset does not contain\n"
159 " This is the default\n"
160 " --subst=none Substitute no characters\n"
161 " --version Show version and copyright information\n",
162 VERSION);
163 exit(EXIT_FAILURE);
164 }
165
166 static void version_info(void)
167 {
168 printf("odt2txt %s\n"
169 "Copyright (c) 2006,2007 Dennis Stosberg <dennis@stosberg.net>\n"
170 "Uses the kunzip library, Copyright 2005,2006 by Michael Kohn\n"
171 "\n"
172 "This program is free software; you can redistribute it and/or\n"
173 "modify it under the terms of the GNU General Public License,\n"
174 "version 2 as published by the Free Software Foundation\n"
175 "\n"
176 "Homepage: http://stosberg.net/odt2txt/\n",
177 VERSION);
178 exit(EXIT_SUCCESS);
179 }
180
/*
 * Resize *buf to len bytes with yrealloc() and re-point *mark so that it
 * sits at the same byte offset inside the resized buffer as it did
 * inside the old one.
 */
static void yrealloc_buf(char **buf, char **mark, size_t len) {
	/* The offset has to be taken before the buffer can move. */
	const ptrdiff_t mark_pos = *mark - *buf;
	char *resized = yrealloc(*buf, len);

	*buf = resized;
	*mark = resized + mark_pos;
}
186
187 #ifdef NO_ICONV
188
/* NO_ICONV build: there is no conversion descriptor to release. */
static void finish_conv(iconv_t ic)
{
	(void)ic;
}
193
/*
 * NO_ICONV build: both encoding names are ignored and a zero
 * placeholder descriptor is returned.
 */
static iconv_t init_conv(const char *input_enc, const char *output_enc)
{
	(void)input_enc;
	(void)output_enc;
	return 0;
}
198
199 static STRBUF *conv(iconv_t ic, STRBUF *buf) {
200 STRBUF *output;
201
202 output = strbuf_new();
203 strbuf_append_n(output, strbuf_get(buf), strbuf_len(buf));
204
205 return output;
206 }
207
208 static void subst_doc(iconv_t ic, STRBUF *buf) {
209 return;
210 }
211
/* NO_ICONV build: no encoding is detected; always yields NULL. */
static char *guess_encoding(void)
{
	char *const no_encoding = NULL;

	return no_encoding;
}
216
217 #else
218
/*
 * Open an iconv descriptor converting from input_enc to output_enc.
 *
 * If iconv_open() reports the pair as unsupported (EINVAL), a warning is
 * printed and a conversion from input_enc to "us-ascii" is opened
 * instead.  Any other failure, or a failing fallback, prints the error
 * to stderr and terminates the process with EXIT_FAILURE.
 *
 * Returns a valid descriptor; the caller releases it with finish_conv().
 */
static iconv_t init_conv(const char *input_enc, const char *output_enc)
{
	iconv_t ic = iconv_open(output_enc, input_enc);

	if (ic != (iconv_t)-1)
		return ic;

	if (errno != EINVAL) {
		fprintf(stderr, "iconv_open returned: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* Report the encoding that was actually requested from this function. */
	fprintf(stderr, "warning: Conversion from %s to %s is not supported.\n",
		input_enc, output_enc);

	ic = iconv_open("us-ascii", input_enc);
	if (ic == (iconv_t)-1) {
		fprintf(stderr, "iconv_open returned: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}
	fprintf(stderr, "warning: Using us-ascii as fall-back.\n");

	return ic;
}
239
/*
 * Close the iconv descriptor ic.  If iconv_close() fails, the error is
 * printed to stderr and the process terminates with EXIT_FAILURE.
 */
static void finish_conv(iconv_t ic)
{
	if (iconv_close(ic) != -1)
		return;

	fprintf(stderr, "iconv_close returned: %s\n", strerror(errno));
	exit(EXIT_FAILURE);
}
247
248 static STRBUF *conv(iconv_t ic, STRBUF *buf)
249 {
250 /* FIXME: This functionality belongs into strbuf.c */
251 ICONV_CHAR *doc;
252 char *out, *outbuf;
253 size_t inleft, outleft = 0;
254 size_t r;
255 size_t outlen = 0;
256 const size_t alloc_step = 4096;
257 STRBUF *output;
258
259 inleft = strbuf_len(buf);
260 doc = (ICONV_CHAR*)strbuf_get(buf);
261 outlen = alloc_step; outleft = alloc_step;
262 outbuf = ymalloc(alloc_step);
263 out = outbuf;
264 outleft = alloc_step;
265
266 do {
267 if (!outleft) {
268 outlen += alloc_step; outleft += alloc_step;
269 yrealloc_buf(&outbuf, &out, outlen);
270 }
271 r = iconv(ic, &doc, &inleft, &out, &outleft);
272 if (r == (size_t)-1) {
273 if(errno == E2BIG) {
274 outlen += alloc_step; outleft += alloc_step;
275 if (outlen > (strbuf_len(buf) << 3)) {
276 fprintf(stderr, "Buffer grew to much. "
277 "Corrupted document?\n");
278 exit(EXIT_FAILURE);
279 }
280 yrealloc_buf(&outbuf, &out, outlen);
281 continue;
282 } else if ((errno == EILSEQ) || (errno == EINVAL)) {
283 char skip = 1;
284
285 /* advance in source buffer */
286 if ((unsigned char)*doc > 0x80)
287 skip += utf8_length[(unsigned char)*doc - 0x80];
288 doc += skip;
289 inleft -= skip;
290
291 /* advance in output buffer */
292 *out = '?';
293 out++;
294 outleft--;
295
296 continue;
297 }
298 fprintf(stderr, "iconv returned: %s\n", strerror(errno));
299 exit(EXIT_FAILURE);
300 }
301 } while(inleft != 0);
302
303 if (!outleft) {
304 outbuf = yrealloc(outbuf, outlen + 1);
305 }
306 *out = '\0';
307
308 output = strbuf_slurp_n(outbuf, (size_t)(out - outbuf));
309 strbuf_setopt(output, STRBUF_NULLOK);
310 return output;
311 }
312
313 static void subst_doc(iconv_t ic, STRBUF *buf)
314 {
315 struct subst *s = substs;
316 ICONV_CHAR *in;
317 size_t inleft;
318 const size_t outbuf_sz = 20;
319 char *outbuf;
320 char *out;
321 size_t outleft;
322 size_t r;
323
324 if (opt_subst == SUBST_NONE)
325 return;
326
327 outbuf = ymalloc(outbuf_sz);
328 while (s->unicode) {
329 if (opt_subst == SUBST_ALL) {
330 RS_G(s->utf8, s->ascii);
331 } else {
332 out = outbuf;
333 outleft = outbuf_sz;
334 in = (ICONV_CHAR*)s->utf8;
335 inleft = strlen(in);
336 r = iconv(ic, &in, &inleft, &out, &outleft);
337 if (r == (size_t)-1) {
338 if ((errno == EILSEQ) || (errno == EINVAL)) {
339 RS_G(s->utf8, s->ascii);
340 } else {
341 fprintf(stderr,
342 "iconv returned an unexpected error: %s\n",
343 strerror(errno));
344 exit(EXIT_FAILURE);
345 }
346 }
347 }
348 s++;
349 }
350 yfree(outbuf);
351 }
352
/*
 * Determine the name of the encoding to convert the output to.
 *
 * On WIN32 the name is "CP" followed by the value of GetACP();
 * elsewhere it is the string returned by nl_langinfo(CODESET).  If that
 * string is missing or empty, a warning is printed and "ISO-8859-1" is
 * used instead.
 *
 * Returns a NUL-terminated string in a 20-byte buffer obtained from
 * ymalloc(); names longer than 19 characters are truncated.  The caller
 * releases it with yfree().
 */
static char *guess_encoding(void)
{
	enum { ENC_BUFSZ = 20 };
	char *enc = ymalloc(ENC_BUFSZ);
#ifdef WIN32
	snprintf(enc, ENC_BUFSZ, "CP%u", GetACP());
#else
	const char *codeset = nl_langinfo(CODESET);

	/* Test the lookup result, not the freshly allocated buffer. */
	if (!codeset || !*codeset) {
		fprintf(stderr, "warning: Could not detect console "
			"encoding. Assuming ISO-8859-1\n");
		codeset = "ISO-8859-1";
	}

	/* strncpy does not terminate on truncation, so do it explicitly. */
	strncpy(enc, codeset, ENC_BUFSZ - 1);
	enc[ENC_BUFSZ - 1] = '\0';
#endif

	return enc;
}
373
374 #endif
375
376 static STRBUF *read_from_zip(const char *zipfile, const char *filename)
377 {
378 int r;
379 STRBUF *content;
380
381 r = kunzip_get_offset_by_name((char*)zipfile, (char*)filename, 3, -1);
382
383 if(-1 == r) {
384 fprintf(stderr,
385 "Can't read from %s: Is it an OpenDocument Text?\n", zipfile);
386 exit(EXIT_FAILURE);
387 }
388
389 content = kunzip_next_tobuf((char*)zipfile, r);
390
391 if (!content) {
392 fprintf(stderr,
393 "Can't extract %s from %s. Maybe the file is corrupted?\n",
394 filename, zipfile);
395 exit(EXIT_FAILURE);
396 }
397
398 return content;
399 }
400
401 static void format_doc(STRBUF *buf)
402 {
403 /* FIXME: Convert buffer to utf-8 first. Are there
404 OpenOffice texts which are not utf8-encoded? */
405
406 /* headline, first level */
407 RS_E("<text:h[^>]*outline-level=\"1\"[^>]*>([^<]*)<[^>]*>", &h1);
408 RS_E("<text:h[^>]*>([^<]*)<[^>]*>", &h2); /* other headlines */
409 RS_G("<text:p [^>]*>", "\n\n"); /* normal paragraphs */
410 RS_G("</text:p>", "\n\n");
411 RS_G("<text:tab/>", " "); /* tabs */
412 RS_G("<text:line-break/>", "\n");
413
414 /* images */
415 RS_E("<draw:frame[^>]*draw:name=\"([^\"]*)\"[^>]*>", &image);
416
417 RS_G("<[^>]*>", ""); /* replace all remaining tags */
418 RS_G("\n +", "\n"); /* remove indentations, e.g. kword */
419 RS_G("\n{3,}", "\n\n"); /* remove large vertical spaces */
420
421 RS_G("'", "'"); /* common entities */
422 RS_G("&", "&");
423 RS_G(""", "\"");
424 RS_G(">", ">");
425 RS_G("<", "<");
426
427 RS_O("^\n+", ""); /* blank lines at beginning and end of document */
428 RS_O("\n{2,}$", "\n");
429 }
430
431 int main(int argc, const char **argv)
432 {
433 struct stat st;
434 iconv_t ic;
435 STRBUF *wbuf;
436 STRBUF *docbuf;
437 STRBUF *outbuf;
438 int i = 1;
439
440 (void)setlocale(LC_ALL, "");
441
442 while (argv[i]) {
443 if (!strcmp(argv[i], "--raw")) {
444 opt_raw = 1;
445 i++; continue;
446 } else if (!strncmp(argv[i], "--encoding=", 11)) {
447 size_t arglen = strlen(argv[i]) - 10;
448 #ifdef iconvlist
449 if (!strcmp(argv[i] + 11, "list")) {
450 show_iconvlist();
451 }
452 #endif
453 opt_encoding = ymalloc(arglen);
454 memcpy(opt_encoding, argv[i] + 11, arglen);
455 i++; continue;
456 } else if (!strncmp(argv[i], "--width=", 8)) {
457 opt_width = atoi(argv[i] + 8);
458 if(opt_width < 3 && opt_width != -1) {
459 fprintf(stderr, "Invalid value for width: %s\n",
460 argv[i] + 8);
461 exit(EXIT_FAILURE);
462 }
463 i++; continue;
464 } else if (!strcmp(argv[i], "--force")) {
465 // ignore this setting
466 i++; continue;
467 } else if (!strncmp(argv[i], "--output=", 9)) {
468 if (*(argv[i] + 9) != '-') {
469 size_t arglen = strlen(argv[i]) - 8;
470 opt_output = ymalloc(arglen);
471 memcpy(opt_output, argv[i] + 9, arglen);
472 }
473 i++; continue;
474 } else if (!strncmp(argv[i], "--subst=", 8)) {
475 if (!strcmp(argv[i] + 8, "none"))
476 opt_subst = SUBST_NONE;
477 else if (!strcmp(argv[i] + 8, "some"))
478 opt_subst = SUBST_SOME;
479 else if (!strcmp(argv[i] + 8, "all"))
480 opt_subst = SUBST_ALL;
481 else {
482 fprintf(stderr, "Invalid value for --subst: %s\n",
483 argv[i] + 8);
484 exit(EXIT_FAILURE);
485 }
486 i++; continue;
487 } else if (!strcmp(argv[i], "--help")) {
488 usage();
489 } else if (!strcmp(argv[i], "--version")
490 || !strcmp(argv[i], "-v")) {
491 version_info();
492 } else if (!strcmp(argv[i], "-")) {
493 usage();
494 } else {
495 if(opt_filename)
496 usage();
497 opt_filename = argv[i];
498 i++; continue;
499 }
500 }
501
502 if(opt_encoding && !strcmp("show", opt_encoding)) {
503 yfree(opt_encoding);
504 opt_encoding = guess_encoding();
505 printf("%s\n", opt_encoding);
506 yfree(opt_encoding);
507 exit(EXIT_SUCCESS);
508 }
509
510 if(opt_raw)
511 opt_width = -1;
512
513 if(!opt_filename)
514 usage();
515
516 if(!opt_encoding) {
517 opt_encoding = guess_encoding();
518 }
519
520 ic = init_conv("UTF-8", opt_encoding);
521
522 if (0 != stat(opt_filename, &st)) {
523 fprintf(stderr, "%s: %s\n",
524 opt_filename, strerror(errno));
525 exit(EXIT_FAILURE);
526 }
527
528 /* read content.xml */
529 docbuf = read_from_zip(opt_filename, "content.xml");
530
531 if (!opt_raw) {
532 subst_doc(ic, docbuf);
533 format_doc(docbuf);
534 }
535
536 wbuf = wrap(docbuf, opt_width);
537
538 /* remove all trailing whitespace */
539 (void) regex_subst(wbuf, " +\n", _REG_GLOBAL, "\n");
540
541 outbuf = conv(ic, wbuf);
542
543 if (opt_output)
544 write_to_file(outbuf, opt_output);
545 else
546 fwrite(strbuf_get(outbuf), strbuf_len(outbuf), 1, stdout);
547
548 finish_conv(ic);
549 strbuf_free(wbuf);
550 strbuf_free(docbuf);
551 strbuf_free(outbuf);
552 #ifndef NO_ICONV
553 yfree(opt_encoding);
554 #endif
555 if (opt_output)
556 yfree(opt_output);
557
558 return EXIT_SUCCESS;
559 }
560
561 static void write_to_file(STRBUF *outbuf, const char *filename)
562 {
563 int fd;
564 ssize_t len;
565
566 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
567 if (fd == -1) {
568 fprintf(stderr, "Can't open %s: %s\n", filename, strerror(errno));
569 exit(EXIT_FAILURE);
570 }
571
572 len = write(fd, strbuf_get(outbuf), strbuf_len(outbuf));
573 if (len == -1) {
574 fprintf(stderr, "Can't write to %s: %s\n", filename, strerror(errno));
575 exit(EXIT_FAILURE);
576 }
577
578 close(fd);
579 }
580
581
582 #ifdef iconvlist
/*
 * Callback handed to iconvlist(): prints the namescount entries of
 * "names" on a single line of stdout, separated by single spaces.
 * "data" is not used.  Always returns 0.
 */
static int print_one (unsigned int namescount, const char * const * names,
		      void *data)
{
	unsigned int i; /* same type as namescount: no signed/unsigned compare */

	(void)data;

	for (i = 0; i < namescount; i++) {
		if (i > 0)
			putc(' ', stdout);
		fputs(names[i], stdout);
	}
	putc('\n', stdout);
	return 0;
}
596
597 static void show_iconvlist() {
598 iconvlist(print_one, NULL);
599 exit(EXIT_SUCCESS);
600 }
601 #endif