Old file.c
  1 /*
  2  * file.c
  3  */
  4 
  5 /*
  6  * mpage:    a program to reduce pages of print so that several pages
  7  *           of output appear on one printed page.
  8  *
  9  * Copyright (c) 1994-2004 Marcel J.E. Mol, The Netherlands
 10  * Copyright (c) 1988 Mark P. Hahn, Herndon, Virginia
 11  *  
 12  *     Permission is granted to anyone to make or distribute verbatim
 13  *     copies of this document as received, in any medium, provided
 14  *     that this copyright notice is preserved, and that the
 15  *     distributor grants the recipient permission for further
 16  *     redistribution as permitted by this notice.
 17  *
 18  */
 19 
 20 
 21 #include "mpage.h"
 22 
 23 
 24 static int looks_utf8(FILE *fp);
 25 
 26 /*
 27  * do_file converts one file into postscript for output.  The file type is
 28  * determined then the proper conversion routine is selected.
 29  */
 30 void
 31 do_file(fname, asheet, outfd)
 32  char *fname;
 33  struct sheet *asheet;
 34  FILE *outfd;
 35 {
 36     FILE *fd;
 37     int in_type;
 38 
 39     /*
 40      * Open fname and try to figure out what type of file it is
 41      */
 42     if ((fd = fopen(fname, "r")) == NULL) {
 43         fprintf(stderr, "%s: cannot open %s\n", MPAGE, fname);
 44         perror(MPAGE);
 45         return;
 46     }
 47 
 48     /*
 49      * if we have the pr option, then we have to assume it's a text file
 50      */
 51     if (opt_pr || opt_input == IN_ASCII) 
 52         in_type = IN_ASCII;
 53     else {
 54         /*
 55          * check for the cutomary characters that flag a postscript file
 56          */
 57         if (ps_check(fd))
 58             in_type = IN_PS;
 59         else
 60             in_type = IN_ASCII;
 61     }
 62 
 63     /*
 64      * For text input check if input is UTF-8 or not
 65      */
 66     if (in_type == IN_ASCII && check_utf8 && looks_utf8 (fd))
 67         use_utf8 = 1;
 68     
 69     (void) fclose(fd);
 70 
 71     if (opt_pr) {
 72         do_pr_file(fname, asheet, outfd);
 73         return;
 74     }
 75 
 76 
 77     /*
 78      * if not using pr(1), open fname and run th file trough the
 79      * specific processor.
 80      */
 81     if ((fd = fopen(fname, "r")) == NULL) {
 82         fprintf(stderr, "%s: cannot open %s\n", MPAGE, fname);
 83         perror(MPAGE);
 84         return;
 85     }
 86 
 87     switch (in_type) {
 88         case IN_ASCII:  do_text_doc(fd, asheet, outfd, fname);
 89                         break;
 90         case IN_PS:     do_ps_doc(fd, asheet, outfd, fname);
 91                         break;
 92         /* Default figure out ourselfes */
 93     }
 94 
 95     (void) fclose(fd);
 96 
 97     return;
 98 
 99 } /* do_file */
100 
101 
102 
103 /*
104  * do_pr_file processes one text file into postscript, but first runs the file
105  * through pr(1).
106  */
107 void
108 do_pr_file(fname, asheet, outfd)
109  char *fname;
110  struct sheet *asheet;
111  FILE *outfd;
112 {
113     FILE *fd;
114     char command[LINESIZE];
115 
116     /*
117      * build the proper command based upon a specified
118      * header or not
119      */
120 #define DASHES "-- "
121     if (opt_header != NULL)
122         (void)sprintf(command, "%s -l%d -w%d -h \"%s\" %s%s", prprog,
123                   asheet->sh_plength, asheet->sh_cwidth, opt_header,
124                   fname[0] == '-' ? DASHES : "", fname);
125     else
126         (void)sprintf(command, "%s -l%d -w%d %s%s", prprog,
127                   asheet->sh_plength, asheet->sh_cwidth,
128                   fname[0] == '-' ? DASHES : "", fname);
129     /*
130      * open a pipe to the proper pr(1) command, and pr provides
131      * us with the input
132      */
133     if ((fd = popen(command, "r")) == NULL) {
134         fprintf(stderr, "%s: cannot create pipe for '%s'\n", MPAGE, command);
135         perror(MPAGE);
136     }
137     else {
138         do_text_doc(fd, asheet, outfd, fname);
139         (void)pclose(fd);
140     }
141 
142     return;
143 
144 } /* do_pr_file */
145 
146 
147 
148 /*
149  * do_stdin uses do_????_doc to process the standard input
150  */
151 void
152 do_stdin(asheet, outfd)
153  struct sheet *asheet;
154  FILE *outfd;
155 {
156 #if 1
157     FILE *fd;
158     char buffer[LINESIZE];
159     char tmpfile[LINESIZE];
160     int incnt, outcnt;
161     int tmpfd;
162 
163     /*
164      * Now the utf8 patch is in we always create a temporary file.
165      * So now is the time to just create a temp file and continue
166      * as if a filename was passed. This has some minor change
167      * on the output pages as it does nit show <stdin> anymore
168      * but the tmpfilename
169      */
170      
171     (void) strcpy(tmpfile, "/tmp/mpage-stdin-XXXXXX");
172     if ( (tmpfd = mkstemp(tmpfile)) == -1) {
173         fprintf(stderr, "%s: cannot create temporary file", MPAGE);
174         perror(MPAGE);
175         return;
176     }
177     close(tmpfd);
178     if ((fd = fopen (tmpfile, "w")) == NULL) {
179         fprintf(stderr, "%s: cannot reopen temporary file", MPAGE);
180         perror(MPAGE);
181         return;
182     } 
183 
184     do {
185         incnt = fread(buffer, 1, sizeof buffer, stdin);
186         outcnt = fwrite(buffer, 1, incnt, fd);
187     } while (incnt && outcnt);
188     (void) fclose(fd);
189 
190     do_file(tmpfile, asheet, outfd);
191 
192     (void) unlink(tmpfile);
193 
194     return;
195 
196 #else
197 
198     FILE *fd;
199     char command[LINESIZE];
200     char tmpfile[LINESIZE];
201     char buffer[LINESIZE];
202     int incnt, outcnt;
203     int tmpfd;
204     if (opt_pr) {
205         Debug(DB_STDIN, "%%do_stdin: pr option selects text\n", 0);
206         /*
207          * if pr(1) is to be used we need to read the input
208          * and pass it to a pr(1) command which will write
209          * a temporary file; this temporary file will then
210          * be used as input to the do_doc routine
211          */
212         (void)strcpy(tmpfile, "/tmp/mpageXXXXXX");
213         if ( (tmpfd = mkstemp(tmpfile)) == -1) {
214             fprintf(stderr, "%s: cannot create temporary file", MPAGE);
215             perror(MPAGE);
216             return;
217         }
218         close(tmpfd);
219         if (opt_header != NULL)
220             (void)sprintf(command, "%s -l%d -w%d -h \"%s\" > %s", prprog,
221                       asheet->sh_plength, asheet->sh_cwidth,
222                       opt_header, tmpfile);
223         else
224             (void)sprintf(command, "%s -l%d -w%d > %s", prprog,
225                       asheet->sh_plength, asheet->sh_cwidth, tmpfile);
226         /*
227          * open a pipe to the pr(1) command which will create a
228          * temporary file for convertin into PS
229          */
230         if ((fd = popen(command, "w")) == NULL) {
231             fprintf(stderr, "%s: cannot create pipe for '%s'\n",
232                 MPAGE, command);
233             perror(MPAGE);
234             return;
235         }
236 #ifdef DEBUG
237         errno = 0;
238         Debug(DB_STDIN, "%% sizeof buffer == %d\n", sizeof buffer);
239 #endif
240         /*
241          * read input to mpage and pass it onto the pr(1) command
242          */
243         do {
244             incnt = fread(buffer, 1, sizeof buffer, stdin);
245             outcnt = fwrite(buffer, 1, incnt, fd);
246             Debug(DB_STDIN, "%% incnt == %d,", incnt);
247             Debug(DB_STDIN, " outcnt == %d,", outcnt);
248             Debug(DB_STDIN, " errno == %d\n", errno);
249         } while (incnt && outcnt);
250         Debug(DB_STDIN, "%% Done with while\n", 0);
251         (void)pclose(fd);
252         Debug(DB_STDIN, "%% closed pipe, looking for tmpfile\n", 0);
253         /*
254          * now open the temporary file and use do_doc to
255          * convert it to PS
256          */
257         if ((fd = fopen(tmpfile, "r")) == NULL) {
258             fprintf(stderr, "%s: cannot open %s\n", MPAGE, tmpfile);
259             perror(MPAGE);
260         }
261         else {
262             /*
263              * check if the input is UTF-8 or not
264              */
265             if (looks_utf8 (fd))
266                 use_utf8 = 1;
267             Debug(DB_STDIN, "%% got tmpfile, now do_doc\n", 0);
268             do_text_doc(fd, asheet, outfd, command);
269             (void)fclose(fd);
270         }
271         /*
272          * tidy up by removing our temp file
273          */
274         Debug(DB_STDIN, "%% now remove '%s'\n", tmpfile);
275         (void)unlink(tmpfile);
276     }
277     else {
278         FILE *tfd;
279         int dont_close = 0;
280 
281         /*
282          * store the input to the temporary file to guess encoding correctly
283          */
284         (void)strcpy(tmpfile, "/tmp/mpageXXXXXX");
285         if ( (tmpfd = mkstemp(tmpfile)) == -1) {
286             fprintf(stderr, "%s: cannot create temporary file", MPAGE);
287             tmpfile[0] = 0;
288         }
289         close(tmpfd);
290         if (tmpfile[0] && (tfd = fopen (tmpfile, "w"))) {
291             do {
292                 incnt = fread(buffer, 1, sizeof buffer, stdin);
293                 outcnt = fwrite(buffer, 1, incnt, tfd);
294             } while (incnt && outcnt);
295             fclose (tfd);
296             if ((fd = fopen(tmpfile, "r")) == NULL) {
297                 fprintf(stderr, "%s: cannot open %s\n", MPAGE, tmpfile);
298                 perror(MPAGE);
299                 /* we should already read the input from stdin.
300                  * so probably it can't recovers
301                  */
302                 return;
303             }
304         } else {
305             /* try to use stdin */
306             fd = stdin;
307             dont_close = 1;
308         }
309         /*
310          * check that the input is whether UTF-8 or not.
311          */
312         if (looks_utf8 (fd))
313             use_utf8 = 1;
314         /*
315          * check for the cutomary flag at the start of postscript files
316          */
317         if (ps_check(fd)) {
318             /*
319              * found the flag signaling PS input
320              */
321             Debug(DB_STDIN, "%%do_stdin: is postscript\n", 0);
322             do_ps_doc(fd, asheet, outfd, "stdin");
323         }
324         else {
325             /*
326              * no postscript flag, print the ascii text
327              */
328             Debug(DB_STDIN, "%%do_stdin: not postscript\n", 0);
329             do_text_doc(fd, asheet, outfd, "stdin");
330         }
331         if (!dont_close)
332             fclose (fd);
333         /* remove the temporary file */
334         if (tmpfile[0])
335             (void)unlink(tmpfile);
336     }
337 
338     return;
339 #endif
340 
341 } /* do_stdin */
342 
343 
344 
345 /*
346  * iswanted () returns 1 if the specified page needs to be printed.
347  *             returns 0 if not.
348  */
349 int
350 iswanted(int sn)
351 {
352     int i;
353 
354     Debug(DB_STDIN, "%%iswanted: opt_jarg: %d\n", opt_jarg);
355     Debug(DB_STDIN, "%%iswanted: sn: %d\n", sn);
356     if (!opt_jarg) {
357         Debug(DB_STDIN, "%%iswanted: wanted page %d\n", sn);
358         ps_outpages++;
359         return 1;
360     }
361     for (i = 0; i < opt_jarg; i++) {
362         Debug(DB_STDIN, "%%iswanted: i: %d\n", i);
363         Debug(DB_STDIN, "%%iswanted: opt_first[i]: %d\n", opt_first[i]);
364         Debug(DB_STDIN, "%%iswanted: opt_alt[i]: %d\n", opt_alt[i]);
365         Debug(DB_STDIN, "%%iswanted: opt_last[i]: %d\n", opt_last[i]);
366         if ((sn >= opt_first[i] && (opt_alt[i] <= 1 || (sn - opt_first[i]) % opt_alt[i] == 0) ) &&
367             (sn <= opt_last[i])) {
368             Debug(DB_STDIN, "%%iswanted: wanted page %d\n", sn);
369             ps_outpages++;
370             return 1;
371         }
372     }
373     Debug(DB_STDIN, "%%iswanted: unwanted page %d\n", sn);
374 
375     return 0;
376 
377 } /* iswanted */
378 
379 
380 
381 /*
382  * do_sheets() is called from do_xxx_doc() to render the sheets;
383  * it does sheet selection and reversal.
384  */
385 void
386 do_sheets(sheetfunc, inf, asheet, outf)
387     int (*sheetfunc)();
388     FILE *inf;
389     struct sheet *asheet;
390     FILE *outf;
391 {
392     FILE *nullf = NULL;
393     register int sheetno;
394     int max_opt_last;
395 
396     max_opt_last = 0;
397     for (sheetno = 0; sheetno < opt_jarg; sheetno++)
398         if (max_opt_last < opt_last[sheetno])
399             max_opt_last = opt_last[sheetno];
400     if (max_opt_last == 0)
401         max_opt_last = MAXINT;
402  
403     Debug(DB_STDIN, "%%do_sheets: max_opt_last: %d\n", max_opt_last);
404  
405     nullf = fopen("/dev/null", "w");
406  
407     if (opt_reverse) {
408         FILE *revf;
409         long *pagebase;
410         int pageroom;
411 
412         revf = tmpfile();
413         if (revf == NULL) {
414             fprintf(stderr, "%s: can't create temporary file\n", MPAGE);
415             exit(1);
416         }
417         pageroom = 50;
418         pagebase = (long *)malloc(pageroom * sizeof(long));
419         if(pagebase == NULL) {
420             fprintf(stderr, "%s: can't malloc 50 words\n", MPAGE);
421             exit(1);
422         }
423         pagebase[0] = 0;
424 
425         for (sheetno = 1; sheetno <= max_opt_last; ) {
426             if ((*sheetfunc)(inf, asheet, iswanted(sheetno) ? revf : nullf)
427                   == FILE_EOF)
428                 break;
429 
430             if (ferror(revf))
431                 break;
432 
433             pagebase[sheetno++] = ftell(revf);
434             if (sheetno >= pageroom) {
435                 pageroom *= 4;
436                 pagebase = (long *)realloc(pagebase, pageroom * sizeof(long));
437                 if (pagebase == NULL) {
438                     fprintf(stderr, "%s: can't malloc %d words\n",
439                                     MPAGE, pageroom);
440                     exit(1);
441                 }
442         
443             }
444         }
445 
446         if (ferror(revf))
447             fprintf(stderr, "%s: error writing to temporary file\n", MPAGE);
448         else {
449             pagebase[sheetno] = ftell(revf);
450             rewind(revf);
451 
452             while (--sheetno >= 0) {
453                 register int i, n;
454                 char buf[BUFSIZ];
455 
456                 fseek(revf, pagebase[sheetno], 0);
457                 for(i = pagebase[sheetno+1]-pagebase[sheetno]; i>0; i-=n) {
458                     n = i < BUFSIZ ? i : BUFSIZ;
459                     if (fread(buf, n, 1, revf) != 1) {
460                         fprintf(stderr, "%s: Premature EOF on temp file\n",
461                         MPAGE);
462                         break;
463                     }
464                     (void) fwrite(buf, n, 1, outf);
465                 }
466             }
467         }
468         fclose(revf);
469         free(pagebase);
470 
471     }
472     else {
473         /* Normal, non-reversed pages */
474         sheetno = 1;
475         while (sheetno <= max_opt_last &&
476                (*sheetfunc)(inf, asheet, iswanted(sheetno) ?
477                         outf : nullf) != FILE_EOF)
478             sheetno++;
479     }
480 
481     if (nullf)
482         fclose(nullf);
483 
484     return;
485 
486 } /* do_sheets */
487 
/*
 * The code below comes from ascmagic.c in file-4.02.
 * The looks_utf8() function has been modified to work on a file
 * handle directly.
 */
492 /*
493  * This table reflects a particular philosophy about what constitutes
494  * "text," and there is room for disagreement about it.
495  *
496  * Version 3.31 of the file command considered a file to be ASCII if
497  * each of its characters was approved by either the isascii() or
498  * isalpha() function.  On most systems, this would mean that any
499  * file consisting only of characters in the range 0x00 ... 0x7F
500  * would be called ASCII text, but many systems might reasonably
501  * consider some characters outside this range to be alphabetic,
502  * so the file command would call such characters ASCII.  It might
503  * have been more accurate to call this "considered textual on the
504  * local system" than "ASCII."
505  *
506  * It considered a file to be "International language text" if each
507  * of its characters was either an ASCII printing character (according
508  * to the real ASCII standard, not the above test), a character in
509  * the range 0x80 ... 0xFF, or one of the following control characters:
510  * backspace, tab, line feed, vertical tab, form feed, carriage return,
511  * escape.  No attempt was made to determine the language in which files
512  * of this type were written.
513  *
514  *
515  * The table below considers a file to be ASCII if all of its characters
516  * are either ASCII printing characters (again, according to the X3.4
517  * standard, not isascii()) or any of the following controls: bell,
518  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
519  *
520  * I include bell because some programs (particularly shell scripts)
521  * use it literally, even though it is rare in normal text.  I exclude
522  * vertical tab because it never seems to be used in real text.  I also
523  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
524  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
525  * character to.  It might be more appropriate to include it in the 8859
526  * set instead of the ASCII set, but it's got to be included in *something*
527  * we recognize or EBCDIC files aren't going to be considered textual.
528  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
529  * and Latin characters, so these should possibly be allowed.  But they
530  * make a real mess on VT100-style displays if they're not paired properly,
531  * so we are probably better off not calling them text.
532  *
533  * A file is considered to be ISO-8859 text if its characters are all
534  * either ASCII, according to the above definition, or printing characters
535  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
536  *
537  * Finally, a file is considered to be international text from some other
538  * character code if its characters are all either ISO-8859 (according to
539  * the above definition) or characters in the range 0x80 ... 0x9F, which
540  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
541  * consider to be printing characters.
542  */
543 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/* Classification of every byte value; indexed by the byte itself. */
static const char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * utf8_scan checks whether the nbytes bytes at buf form UTF-8 text.
 *
 * Returns 0 as soon as a 7-bit byte is found that text_chars does not
 * class as plain text, a continuation byte appears where a lead byte is
 * expected, a lead byte is not followed by the continuation bytes it
 * announces, or a byte is 0xfe or 0xff.  Lead bytes announcing up to
 * five continuation bytes are accepted.  A sequence cut off by the end
 * of the buffer is not an error: the scan simply stops there.
 *
 * Otherwise returns 1 if at least one complete multi-byte sequence was
 * seen and 0 if the data was all 7-bit.
 */
static int
utf8_scan(const unsigned char *buf, long nbytes)
{
        long i;
        int n, following;
        int gotone = 0;

        for (i = 0; i < nbytes; i++) {
                unsigned char lead = buf[i];

                if ((lead & 0x80) == 0) {          /* 0xxxxxxx is plain ASCII */
                        /*
                         * Even if the whole file is valid UTF-8 sequences,
                         * still reject it if it uses weird control characters.
                         */
                        if (text_chars[lead] != T)
                                return 0;
                        continue;
                }

                if ((lead & 0x40) == 0)            /* 10xxxxxx never 1st byte */
                        return 0;

                /* 11xxxxxx begins UTF-8: count the announced followers */
                if ((lead & 0x20) == 0)            /* 110xxxxx */
                        following = 1;
                else if ((lead & 0x10) == 0)       /* 1110xxxx */
                        following = 2;
                else if ((lead & 0x08) == 0)       /* 11110xxx */
                        following = 3;
                else if ((lead & 0x04) == 0)       /* 111110xx */
                        following = 4;
                else if ((lead & 0x02) == 0)       /* 1111110x */
                        following = 5;
                else
                        return 0;

                for (n = 0; n < following; n++) {
                        i++;
                        if (i >= nbytes)
                                return gotone;     /* cut off by end of data */

                        if ((buf[i] & 0xc0) != 0x80)   /* must be 10xxxxxx */
                                return 0;
                }

                gotone = 1;
        }

        return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
}

/*
 * looks_utf8 reports whether the rest of fp, from the current position
 * to the end of the file, looks like UTF-8 text.
 *
 * The data is read into memory in one piece and checked by utf8_scan().
 * The stream position is put back to where it was on entry.
 *
 * Returns 1 if the data contains at least one multi-byte UTF-8 sequence
 * and nothing that disqualifies it, 0 otherwise.  0 is also returned
 * when the stream cannot be positioned (ftell or fseek fails), when
 * there is no data, or when the buffer cannot be allocated.
 */
static int
looks_utf8(FILE *fp)
{
        long start, end, nbytes;
        unsigned char *buf;
        int result;

        /* memorize current position */
        start = ftell (fp);
        if (start < 0)
                return 0;

        /* check the input size */
        if (fseek (fp, 0L, SEEK_END) != 0)
                return 0;
        end = ftell (fp);

        /* go back to where we started before anything else can fail */
        if (fseek (fp, start, SEEK_SET) != 0)
                return 0;
        if (end <= start)
                return 0;
        nbytes = end - start;

        buf = malloc ((size_t) nbytes);
        if (buf == NULL)
                return 0;

        /* read data; scan only what was really read */
        nbytes = (long) fread (buf, 1, (size_t) nbytes, fp);

        /* rewind the position again */
        (void) fseek (fp, start, SEEK_SET);

        result = utf8_scan (buf, nbytes);

        free (buf);

        return result;
}