/* query.c -- apply a boolean query to a keyword file */
/* Bruce Tanner -- Cerritos College */
/* Version history:
Modifications by Richard D. Piccard -- Ohio University
2000/07/07 Add home_page function immediately before sorting, to
change the ranking of the hits based on several
criteria. The numerical values can be customized
readily, and two environment variables can be used
to suppress particular hosts or paths.
2000/07/10 Use include files custom_head.c and custom_foot.c for
cosmetic and site-specific changes to the HTML at
the top and bottom of the generated page.
2000/08/01 Add capability for REQUIRED_REALM to home_page.
2000/08/24 Removed \n prior to > closing tags, and abolished
\n
\n
\n
\n");
#include "custom_research.c"
#include "custom_foot.c"
}
return;
}
/*
 * Read an integer tuning parameter from the environment variable `name`.
 * Returns `fallback` when the variable is not set; otherwise returns the
 * atoi() conversion of its value (so non-numeric text converts to 0).
 */
static int hp_env_int(const char *name, int fallback)
{
    const char *text = getenv(name);

    return (text != NULL) ? atoi(text) : fallback;
}

/*
 * Like hp_env_int(), but for parameters that are used as divisors:
 * a value of 0 (including non-numeric text) is replaced by `fallback`
 * so that the score arithmetic can never divide by zero.
 * `fallback` itself must be non-zero.
 */
static int hp_env_divisor(const char *name, int fallback)
{
    int value = hp_env_int(name, fallback);

    return (value != 0) ? value : fallback;
}

/*
 * Copy `src` into `dst` (capacity `dstsize` bytes, which must be > 0),
 * truncating if necessary.  `dst` is always NUL-terminated and `src`
 * is never modified.
 */
static void hp_copy_bounded(char *dst, size_t dstsize, const char *src)
{
    size_t n = strlen(src);

    if (n >= dstsize)
        n = dstsize - 1;
    memcpy(dst, src, n);
    dst[n] = '\0';
}

/*
 * Append `src` to the NUL-terminated string in `dst` (capacity `dstsize`
 * bytes), silently truncating whatever does not fit.
 */
static void hp_append_bounded(char *dst, size_t dstsize, const char *src)
{
    size_t used = strlen(dst);

    if (used < dstsize - 1)
        strncat(dst, src, dstsize - 1 - used);
}

/*
 * Modify the scores to elevate the ranking of home pages (using the path)
 * and to elevate the ranking of pages where the hit was early enough on the
 * page that it might have been from meta-tagged keywords or description.
 * Based on www_emit. - RDP
 *
 * For every selector in `result` the function rebuilds the URL pieces
 * (method, host, port, path), adjusts the raw score and stores it back
 * into result.select[ind].score.
 *
 * Environment variables consulted (default used when unset):
 *   WWW_HOME_BOOST   (400) added when the path looks like a home page
 *   WWW_PLACE_TOP    (50)  hit position at or below which WWW_TOP_BOOST applies
 *   WWW_TOP_BOOST    (50)  added for hits at or before WWW_PLACE_TOP
 *   WWW_PLACE_EARLY  (150) hits at or before this position get
 *                          (place_early - position) / WWW_PLACE_SCALE added
 *   WWW_PLACE_SCALE  (2)   divisor for the early-placement boost
 *   WWW_SLASH_MAX    (4)   paths with at most this many '/' get
 *   WWW_SLASH_ADD    (50)    slash_add * (slash_max - slashes) added
 *   WWW_DEPRECATED_HOST / WWW_BAD_HOST_CUT (2)  score divided by the cut
 *                          when the host contains the deprecated string
 *   WWW_DEPRECATED_PATH / WWW_BAD_PATH_CUT (2)  likewise for the path
 *   WWW_REQUIRED_REALM     when set (more than 2 characters), any URL that
 *                          does not contain it gets a score of 0
 *   WWW_IPATH_INFO, WWW_IPATH_TRANSLATED, WWW_OMIT_IPATH_CONVERSION
 *                          control replacing the translated path prefix
 *                          with the ipath_info prefix
 *
 * A divisor (WWW_PLACE_SCALE, WWW_BAD_HOST_CUT, WWW_BAD_PATH_CUT) that
 * converts to 0 falls back to its default instead of dividing by zero.
 *
 * WWW_DEPRECATED_PATH / WWW_DEPRECATED_HOST must be 4..249 characters and
 * WWW_REQUIRED_REALM at most 249; otherwise too_bad() is called and the
 * program exits with status 1.
 */
static void home_page (Result result)
{
    int ind, rawplace, rawscore;
    int slashcount;
/*
 * NOTE(review): these macros refer to a `cdate` buffer that this function
 * never used; they are kept token-for-token only because preprocessor
 * definitions stay visible to the rest of the file -- confirm whether any
 * later code relies on them before deleting.
 */
#define MMM cdate+4
#define DD cdate+8
#define HHMM cdate+11
#define YY cdate+22
    unsigned short use_ipath_info = FALSE;
    char *cp, *ptr1, *name, *path;
    char ptype[25], method[10], bad_host[250], bad_path[250];
    char required_realm[250], this_url[250];
    int new_style;
    char *file_name;
    char ipath_info[SPEC_SIZE];
    char ipath_translated[SPEC_SIZE], alt_ipath_translated[SPEC_SIZE];
    char *alt_path = NULL;
    char gtype;
    char *host, *port;
    const char *tail;
    size_t len, prefixlen;
    int bad_path_cut, bad_host_cut, slash_max, slash_add;
    int place_top, top_boost, place_early, place_scale, home_boost;

    /*
     * Initialize cut and boost parameters.  The three divisors are
     * protected against a zero value.
     */
    bad_path_cut = hp_env_divisor("WWW_BAD_PATH_CUT", 2);
    bad_host_cut = hp_env_divisor("WWW_BAD_HOST_CUT", 2);
    slash_max    = hp_env_int("WWW_SLASH_MAX", 4);
    slash_add    = hp_env_int("WWW_SLASH_ADD", 50);
    place_top    = hp_env_int("WWW_PLACE_TOP", 50);
    top_boost    = hp_env_int("WWW_TOP_BOOST", 50);
    place_early  = hp_env_int("WWW_PLACE_EARLY", 150);
    place_scale  = hp_env_divisor("WWW_PLACE_SCALE", 2);
    home_boost   = hp_env_int("WWW_HOME_BOOST", 400);

    /*
     * Check for deprecated path
     */
    bad_path[0] = '\0';
    if ((cp = getenv("WWW_DEPRECATED_PATH")) != NULL) {
        len = strlen(cp);
        if (len < 4 || len > sizeof(bad_path) - 1) {
            too_bad(BAD_PATH_LENGTH);
            exit(1);
        }
        strcpy(bad_path, cp);
    }

    /*
     * Check for deprecated host
     */
    bad_host[0] = '\0';
    if ((cp = getenv("WWW_DEPRECATED_HOST")) != NULL) {
        len = strlen(cp);
        if (len < 4 || len > sizeof(bad_host) - 1) {
            too_bad(BAD_HOST_LENGTH);
            exit(1);
        }
        strcpy(bad_host, cp);
    }

    /*
     * Check for required realm.  Values of one or two characters are
     * treated as "no realm required".
     */
    required_realm[0] = '\0';
    if ((cp = getenv("WWW_REQUIRED_REALM")) != NULL) {
        len = strlen(cp);
        if (len > sizeof(required_realm) - 1) {
            too_bad(BAD_REALM_LENGTH);
            exit(1);
        }
        if (len > 2)
            strcpy(required_realm, cp);
    }

    /*
     * Check whether to substitute ipath_info for ipath_translated
     * in hit lists, and if so, set up the translation.  The environment
     * strings are copied with truncation; they are never modified in place.
     */
    if (getenv("WWW_OMIT_IPATH_CONVERSION") == NULL &&
        (cp = getenv("WWW_IPATH_INFO")) != NULL) {
        hp_copy_bounded(ipath_info, sizeof(ipath_info), cp);
        if ((cp = getenv("WWW_IPATH_TRANSLATED")) != NULL) {
            hp_copy_bounded(ipath_translated, sizeof(ipath_translated), cp);
            /*
             * Make sure we handle both root (device/000000/)
             * and subdirectory (device/foo/) paths: the alternate form
             * is ipath_translated with the first "000000/" removed.
             */
            cp = strstr(ipath_translated, "000000/");
            if (cp != NULL) {
                prefixlen = (size_t)(cp - ipath_translated);
                memcpy(alt_ipath_translated, ipath_translated, prefixlen);
                strcpy(alt_ipath_translated + prefixlen, cp + 7);
            } else {
                alt_ipath_translated[0] = '\0';
            }
            use_ipath_info = TRUE;
        }
    }

    /* Go through the selectors one at a time. */
    for (ind = 0; ind < result.count; ind++) {
        find_selector(result.select[ind]);
        ptr1 = (char *) (selrab.rab$l_ubf +
                         index_size[result.select[ind].file]);
        parse_selector(ptr1, &new_style, &name, &gtype, ptype,
                       &file_name, &host, &port, method, &path);

        /* Get the path field for the URL. */
        if (ptype[0] != 'R') {
            /* No range is indicated. Use a direct URL. */
            if (strlen(path) == 0)               /* no path given in selector */
                path = HTVMS_wwwName(file_name); /* use file name */
        } else if (use_ipath_info == TRUE) {
            /* Range selector: convert the VMS pathspec. */
            path = HTVMS_wwwName(file_name);
        } else {
            /* Range selector with a VMS pathspec, so hex escape it. */
            alt_path = URLescape(file_name);
            path = alt_path;
        }

        /*
         * Check for ipath_translated or alt_ipath_translated
         * and substitute ipath_info (if set).  `tail` is the part of the
         * path that follows the matched prefix.
         */
        if (use_ipath_info == TRUE) {
            tail = NULL;
            prefixlen = strlen(ipath_translated);
            if (strncasecomp(path, ipath_translated, prefixlen) == 0) {
                tail = path + prefixlen;
            } else if (alt_ipath_translated[0] != '\0') {
                prefixlen = strlen(alt_ipath_translated);
                if (strncasecomp(path, alt_ipath_translated, prefixlen) == 0)
                    tail = path + prefixlen;
            }
            if (tail != NULL) {
                alt_path = (char *)malloc(strlen(ipath_info) + strlen(tail) + 1);
                if (alt_path != NULL) {
                    strcpy(alt_path, ipath_info);
                    strcat(alt_path, tail);
                    path = alt_path;
                }
                /* on allocation failure the unconverted path is scored */
            }
        }

        /* provide http default if a host is given with no method */
        if (method[0] == '\0' && host[0] != '\0')
            strcpy(method, "http");

        /*
         * At this point, method, host, port, and path are the obvious parts
         * of the HREF that will be created for the link. - RDP
         *
         * Assemble them into this_url, which is only used for the realm
         * test below.  Every piece is appended with a bound, so a
         * pathologically long URL is truncated rather than overflowing.
         */
        this_url[0] = '\0';
        if (method[0] != '\0') {
            hp_append_bounded(this_url, sizeof(this_url), method);
            hp_append_bounded(this_url, sizeof(this_url), "://");
        }
        hp_append_bounded(this_url, sizeof(this_url), host);
        hp_append_bounded(this_url, sizeof(this_url), port);
        hp_append_bounded(this_url, sizeof(this_url), path);

        rawscore = result.select[ind].score;
        rawplace = result.select[ind].pos;

        if (required_realm[0] == '\0' ||
            strstr(this_url, required_realm) != NULL) {
            /*
             * either no required realm or it matches, so go ahead
             * with the boost and cut calculations
             */
            len = strlen(path);
            if (strstr(path, "index.") != NULL ||
                strstr(path, "welcome.") != NULL ||
                strstr(path, "default.") != NULL ||
                (len > 0 && path[len - 1] == '/')) {
                /*
                 * we have a home page, folks!
                 */
                rawscore += home_boost;
            }

            /*
             * Apply boost for early location in page -- primary target
             * is keywords and description text from META tags.
             */
            if (rawplace <= place_top)
                rawscore += top_boost;
            if (rawplace <= place_early)
                rawscore += (place_early - rawplace) / place_scale;

            /*
             * Apply boost for high-level path (small number of slashes).
             */
            slashcount = 0;
            for (cp = path; *cp != '\0'; cp++) {
                if (*cp == '/')
                    slashcount++;
            }
            if (slashcount <= slash_max)
                rawscore += slash_add * (slash_max - slashcount);

            /*
             * Apply cut for pages on deprecated host.
             */
            if (strlen(bad_host) >= 2 && strstr(host, bad_host) != NULL)
                rawscore = rawscore / bad_host_cut;

            /*
             * Apply cut for pages on deprecated path.
             */
            if (strlen(bad_path) >= 2 && strstr(path, bad_path) != NULL)
                rawscore = rawscore / bad_path_cut;
        } else {
            /*
             * required realm and it did not match
             */
            rawscore = 0;
        }

        result.select[ind].score = rawscore;

        /* release the per-selector path buffer, if one was allocated */
        free(alt_path);
        alt_path = NULL;
    }
    return;
}
/*
** Redirect or output a client's request for a RANGE fetch.
*/
static void display_result(char *file_name)
{
int start, end, i;
unsigned short omit_pre = FALSE;
char *cp, inputline[SPEC_SIZE];
FILE *fp;
/* Get the range and VMS filename. */
if (orig_qstr[5] == ' ') /** "TEXT= R..." **/
i = 7;
else /** "TEXT=R..." or "TEXT R..." **/
i = 6;
sscanf(orig_qstr+i, "%d-%d-%s", &start, &end, file_name);
/* Force any paths whose device begins with WWW_Root to begin at root? */
if (!strncasecomp(file_name, "WWW_Root", 8) &&
(cp=strstr(file_name, ":[")) != NULL &&
getenv("WWW_FORCE_000000") != NULL) {
cp += 2;
if (*cp && strncmp(cp, "000000", 6)) {
strncpy(inputline, file_name, cp-file_name);
inputline[cp-file_name] = '\0';
strcat(inputline, "000000.");
strcat(inputline, cp);
strcpy(file_name, inputline);
}
}
/* Use Location: for raw files to implement httpd authorization checks. */
/* (still need this for old or jury-rigged URLs) */
if (getenv("WWW_SEND_RAW_FILE") != NULL) {
printf("Location: %s%s%s\n\n",
strlen(Host) ? "http://" : "",
strlen(Host) ? Host : "",
HTVMS_wwwName(file_name));
return;
}
/* Set up the HTML rendition for document sections. */
if (!omit_content_type)
printf("Content-Type: text/html\n\n");
if (!omit_head) {
printf("\n
\n"); /* Get and send the section with appropriate further packaging. */ if ((cp=getenv("WWW_IPATH_TRANSLATED")) != NULL && getenv("WWW_OMIT_IPATH_CONVERSION") == NULL) fp = fopen(cp, "r", "shr=get", "mbc=32"); else fp = fopen(file_name, "r", "shr=get", "mbc=32"); if (fp == NULL) { printf("\n"); if (!omit_foot) #include "custom_foot.c" } static char *URLescape(char *str) { #define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & 1)) char *p; char *q; char *result; int unacceptable = 0; for(p=str; *p; p++) if (!ACCEPTABLE((unsigned char)*p)) unacceptable++; result = (char *) malloc(p-str + unacceptable+ unacceptable + 1); for(q=result, p=str; *p; p++) { unsigned char a = *p; if (!ACCEPTABLE(a)) { *q++ = '%'; /* Means hex commming */ *q++ = hex[a >> 4]; *q++ = hex[a & 15]; } else *q++ = *p; } *q++ = '\0'; /* Terminate */ return(result); } static char *HTVMS_wwwName(char *vmsname) { static char wwwname[SPEC_SIZE]; char *src, *dst; int dir; dst = wwwname; src = vmsname; dir = 0; if (strchr(src,':')) *(dst++) = '/'; for ( ; *src != '\0' ; src++) { switch(*src) { case ':': *(dst++) = '/'; break; case '-': if (dir) { if ((*(src-1)=='[' || *(src-1)=='.' || *(src-1)=='-') && (*(src+1)=='.' || *(src+1)=='-')) { *(dst++) = '/'; *(dst++) = '.'; *(dst++) = '.'; } else *(dst++) = '-'; } else { if (*(src-1) == ']') *(dst++) = '/'; *(dst++) = '-'; } break; case '.': if (dir) { if (*(src-1) != '[') *(dst++) = '/'; } else { if (*(src-1) == ']') *(dst++) = '/'; *(dst++) = '.'; } break; case '[': dir = 1; break; case ']': dir = 0; break; default: if (*(src-1) == ']') *(dst++) = '/'; *(dst++) = *src; break; } } *(dst++) = '\0'; return(wwwname); } static char *FileType(char gtype) { static char filetype[256]; if (getenv("WWW_SHOW_FILETYPE") == NULL) { filetype[0] = '\0'; return(filetype); } switch(gtype) { case '0': case 'R': case 'h': case 'M': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? 
Host : "", "/httpd-internal-icons/text.xbm"); break; case '1': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/directory.xbm"); break; case '4': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/binhex.xbm"); break; case '5': case '9': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/binary.xbm"); break; case '6': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/uu.xbm"); break; case '7': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/index.xbm"); break; case 'g': case 'I': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/image.xbm"); break; case 's': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/sound.xbm"); break; case ';': sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/movie.xbm"); break; default: sprintf (filetype, " ", strlen(Host) ? "http://" : "", strlen(Host) ? Host : "", "/httpd-internal-icons/unknown.xbm"); break; } return(filetype); } static void too_bad(char *reason) { if (!(switch_present("www") || getenv("WWW_GATEWAY_INTERFACE") != NULL)) { printf("%s\n", reason); /* non HTML error */ return; } /* set up the HTML rendition */ if (!omit_content_type) printf("Content-type: text/html\n\n"); if (!omit_head) { printf("\n\n"); if (TITLE != NULL) printf("ERROR 403
\nUnable to fopen() database."); #include "custom_foot.c" return; } fseek(fp, start, SEEK_SET); if(omit_pre) printf("This is from the document %s\n\n", file_name); else printf("\nThis is from the document %s\n\n", file_name); while (fgets(inputline, sizeof(inputline), fp) != NULL) { printf("%s", inputline); if (ftell(fp) >= end) break; } fclose(fp); if (omit_pre) printf("\n"); else printf("\n