summaryrefslogtreecommitdiffstats
path: root/tkhtml1/tools/getpage.c
blob: 2b2bc5676718a04e82376615fbf7a2a389d8a53a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
** This is a simple program used to retrieve an HTML document using
** HTTP.  The program also fetches all images that the document
** references.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include "getpage.h"

/* The markup handlers in this file compare attribute names with
** stricmp(); map that name onto strcasecmp(). */
#define stricmp strcasecmp


/*
** One record is kept for every distinct image URL that has been
** requested.  The records are chained together through pNext to form
** a singly linked list.
*/
typedef struct Image {
  char *zUrl;             /* The URL for this image */
  char *zLocal;           /* The local filename */
  struct Image *pNext;    /* Next in a list of them all */
} Image;

static FILE *html;            /* Html output to this file. */
static int nImage = 0;        /* Number of images loaded so far */
static Image *pImage;         /* Head of the list of all images */
static int global_nErr = 0;   /* System wide errors; main() returns this */
static char baseUrl[1000];    /* The base URL used to resolve image URLs */
static int quiet = 0;         /* The quiet flag; set by the -quiet option */

/*
** Make sure the given URL is loaded as a local file.  Return the
** name of the local file.
**
** If zUrl is already on the pImage list, the local name recorded for it
** is returned and nothing is fetched.  Otherwise a new Image record is
** allocated, given the local name "image<N>" (N is the incremented
** image counter), pushed on the front of the list, and HttpFetch() is
** called with the URL and that local name.  The result of HttpFetch()
** is not examined.
**
** The returned string is owned by the Image record; the caller must
** not free it.  zUrl is copied, so the caller keeps ownership of it.
** The program exits with status 1 if memory cannot be allocated.
*/
static char *GetImage(char *zUrl){
  Image *p;
  size_t nUrl;               /* Length of zUrl, not counting the NUL */
  const size_t nLocal = 100; /* Space reserved for the local filename */

  for(p=pImage; p; p=p->pNext){
    if( strcmp(p->zUrl,zUrl)==0 ){
      return p->zLocal;
    }
  }

  /* The record, the copy of the URL and the local filename all live in
  ** a single allocation. */
  nUrl = strlen(zUrl);
  p = malloc( sizeof(*p) + nUrl + 1 + nLocal );
  if( p==0 ){
    fprintf(stderr,"out of memory\n");
    exit(1);
  }
  p->zUrl = (char*)&p[1];
  memcpy(p->zUrl, zUrl, nUrl+1);
  p->zLocal = &p->zUrl[nUrl+1];
  snprintf(p->zLocal, nLocal, "image%d", ++nImage);
  p->pNext = pImage;
  pImage = p;
  HttpFetch(zUrl, p->zLocal, quiet, 0, 0);
  return p->zLocal;
}

/*
** Print a usage message on standard error and exit with status 1.
** argv0 is the name the program was invoked under.  This routine
** does not return.
*/
void usage(char *argv0){
  /* main() accepts an optional -quiet switch, so advertise it here. */
  fprintf(stderr,"Usage: %s [-quiet] URL\n",argv0);
  exit(1);
}

/*
** Handle anything that isn't markup.  main() registers this routine
** for words, white space and comments.  The text is copied verbatim
** to the output file.
*/
static void WordHandler(const char *zText, void *notUsed){
  (void)notUsed;
  /* zText comes from the fetched document, so it must never be used as
  ** a printf format string: a '%' in the page would be interpreted as
  ** a conversion.  fputs() writes it literally. */
  fputs(zText, html);
}

/*
** Fallback handler for markup that needs no special treatment.  The
** tag is copied to the output file: argv[0] is written as the tag name
** and the entries after it are written in pairs as name="value".  A
** trailing entry that has no partner is not written.
*/
static void DefaultMarkup(int argc, const char **argv, void *notUsed){
  int k = 1;
  fprintf(html, "<%s", argv[0]);
  while( k+1<argc ){
    const char *zName = argv[k];
    const char *zValue = argv[k+1];
    fprintf(html, " %s=\"%s\"", zName, zValue);
    k += 2;
  }
  fprintf(html, ">");
}

/*
** Handler for <IMG> markup
*/
static void ImageMarkup(int argc, const char **argv, void *notUsed){
  int i;
  for(i=1; i<argc-1; i+=2){
    if( stricmp(argv[i],"src")==0 ){
      const char *azUrl[2];
      char *zResolved;
      azUrl[0] = argv[i+1];
      azUrl[1] = 0;
      zResolved = ResolveUrl(baseUrl, azUrl);
      if( !quiet ){
        printf("Resolved: (%s) (%s) -> (%s)\n",baseUrl, azUrl[0], zResolved);
      }
      argv[i+1] = GetImage(zResolved);
      /* printf("%s -> %s -> argv[i+1]\n",argv[i+1], zResolved); */
      free(zResolved);
    }
  }
  DefaultMarkup(argc, argv, 0);
}

/*
** Handler for <BASE> markup.
**
** The value of each "href" attribute (matched without regard to case)
** is copied into baseUrl, truncated if necessary so that it always
** fits and is always NUL-terminated.  If the tag carries more than one
** href, the last one wins.  The <BASE> tag itself is not written to
** the output file.
*/
static void BaseMarkup(int argc, const char **argv, void *notUsed){
  int i;
  (void)notUsed;
  for(i=1; i<argc-1; i+=2){
    if( stricmp(argv[i],"href")==0 ){
      if( !quiet ){
        printf("Base Href=%s\n",argv[i+1]);
      }
      /* snprintf() bounds the copy by the buffer size including the
      ** terminator.  A "%.*s" precision of sizeof(baseUrl) would allow
      ** one byte too many and would pass a size_t where an int is
      ** required. */
      snprintf(baseUrl, sizeof(baseUrl), "%s", argv[i+1]);
    }
  }
}

/*
** Name of a temporary file.  main() passes this name to HttpFetch()
** together with the page URL, then reopens the file to read the raw
** HTML.  main() removes the file only when the fetch fails.
*/
static char zTemp[] = "index.html.orig";

/*
** The main routine.
**
** Usage:  PROGRAM [-quiet] URL
**
** The page at URL is fetched into the temporary file zTemp.  That file
** is then run through the SGML parser, whose handlers write a copy of
** the page to "index.html" with <IMG> src attributes rewritten to the
** local image files.  If several URLs are given, the last one is used.
**
** The process exits with status 1 if the usage is wrong, if the base
** page cannot be fetched (HttpFetch() result other than 200), or if a
** file cannot be opened.  Otherwise the return value is global_nErr,
** which is incremented here if writing "index.html" fails.
*/
int main(int argc, char **argv){
  int i;                 /* Loop counter */
  int rc;                /* Result code */
  char *zUrl = 0;        /* The URL */
  FILE *in;              /* For reading the raw html */

  if( argc<2 ) usage(argv[0]);
  for(i=1; i<argc; i++){
    if( strcmp(argv[i],"-quiet")==0 ){
      quiet = 1;
    }else if( argv[i][0]=='-' ){
      usage(argv[0]);
    }else{
      zUrl = argv[i];
    }
  }
  if( zUrl==0 ) usage(argv[0]);
  rc = HttpFetch(zUrl, zTemp, quiet, sizeof(baseUrl), baseUrl);
  if( rc!=200 ){
    unlink(zTemp);
    fprintf(stderr,"Unable to fetch base page %s\n", zUrl);
    exit(1);
  }
  in = fopen(zTemp,"r");
  /* Note: zTemp is not removed after a successful fetch. */
  if( in==0 ){
    perror("can't reopen temporary file!");
    exit(1);
  }
  html = fopen("index.html","w");
  if( html==0 ){
    perror("can't open output file \"index.html\"");
    exit(1);
  }
  SgmlWordHandler(WordHandler);
  SgmlSpaceHandler(WordHandler);
  SgmlCommentHandler(WordHandler);
  SgmlDefaultMarkupHandler(DefaultMarkup);
  SgmlHandler("img", ImageMarkup);
  SgmlHandler("base", BaseMarkup);
  SgmlParse(in, 0);
  fclose(in);

  /* A failed write may show up either in the stream's error indicator
  ** or only when fclose() flushes the last buffer, so check both. */
  rc = ferror(html);
  if( fclose(html)!=0 || rc ){
    fprintf(stderr,"error writing output file \"index.html\"\n");
    global_nErr++;
  }
  return global_nErr;
}