/*
** PROGRAM NAME: RANK.CPL
** -------------
** Program Source Code File (.CPL)
** Concordance(tm) Information Retrieval System, Professional Edition
**
** COPYRIGHT:
** ----------
** (C) Copyright Dataflight Software, Inc. 1992. All Rights Reserved.
**
** SYNOPSIS: Ranks documents in order of relevance based on word occurrences.
** ---------
**
*/

/* Uniform width of document number stored in temporary btree file. */
/* Used for every key written to and looked up in the weight btree. */
short WIDTH = 10;

/* Error codes returned by rank() and reported to the user by main(). */
short DISKFULLERROR = 1;
short REINDEXERROR  = 2;
short USEREXITERROR = 3;
short NOHITSERROR   = 4;
short OPENDBERROR   = 5;
short TEMPFILEERROR = 6;
short IVTFILEERROR  = 7;
short DCTFILEERROR  = 8;
short QUERYZEROERR  = 9;
short ZERODOCERROR  = 10;

/****************************************************************
*  Name:     main                                               *
*  Synopsis: Runs rank() and displays a message describing its  *
*            result.  A user cancellation is reported silently. *
****************************************************************/
main()
{
int db;

    switch(rank(db)) {
        case DISKFULLERROR: Message("Disk full encountered while writing temporary file.");
                            break;
        case REINDEXERROR : Message("This database must be reindexed before ranking.");
                            break;
        case USEREXITERROR: /* The user already confirmed the cancel. */
                            break;
        case NOHITSERROR  : Message("This search had zero hits in Paragraph fields!");
                            break;
        case OPENDBERROR  : Message("Please open a database before running rank.");
                            break;
        case TEMPFILEERROR: Message("Couldn't create temporary .LST file. ");
                            break;
        case DCTFILEERROR : Message("Couldn't open database dictionary.");
                            break;
        case IVTFILEERROR : Message("Couldn't open database .IVT file.");
                            break;
        case ZERODOCERROR : Message("This search located zero documents, nothing to do.");
                            break;
        case QUERYZEROERR : Message("Please do a search before running rank.");
                            break;
        case 0:             Message("This search has been ranked.");
                            break;
    }
} /* main() */

/****************************************************************
*  Name:     rank                                               *
*  Synopsis: Ranks documents using DISCUS algorithm.            *
*            Walks every hit of the current query, sums one     *
*            weight per distinct word per record (Paragraph     *
*            fields only), stores the totals in a temporary     *
*            btree, then sorts the query with rankSort().       *
*  Params:   db - database handle.                              *
*  Returns:  0 on success, otherwise one of the error codes     *
*            declared at the top of this file.                  *
****************************************************************/
rank(int db)
{
int     error,                  /* Error control variable.                 */
        i,j,                    /* Misc loop variables.                    */
        document,               /* Document number being processed.        */
        field,                  /* Field number with word occurrence.      */
        offset,                 /* Offset into field of word.              */
        physicalDocumentNumber, /* Database record number being processed. */
        invtHandle,             /* Handle to opened .IVT file.             */
        treeHandle,             /* Handle to opened .DCT file.             */
        weightedTreeHandle,     /* Handle to btree of document weights.    */
        blocks,                 /* Number of blocks in progress display.   */
        noDupsAllowed;          /* Handle to btree that rejects duplicates.*/
char    word[33],               /* Word to weight, at "offset" in "field." */
        treeName[128],          /* Name of temporary weightedTreeHandle.   */
        noDupsAllowedTreeName[128]; /* Name of temporary noDupsAllowed.    */
float   weight,                 /* Calculated weight of document.          */
        percent;                /* Progress percentage through database.   */
text    screen;                 /* Saved screen, restored on exit.         */

    /* Set explicit starting values: error is only assigned on failure  */
    /* paths below, weight is tested before it is first accumulated,    */
    /* and noDupsAllowed must read as "not created" if its creation     */
    /* loop never executes.                                             */
    error         = FALSE;
    weight        = 0.0;
    noDupsAllowed = -1;

    cursoroff();

    /* Do some initial error checking to ensure that a database  */
    /* is open, that documents were located in a query, and that */
    /* a search was actually performed.                          */
    if (db.documents < 0)
        error = OPENDBERROR;
    else {
        if (count(db) <= 0)
            error = ZERODOCERROR;
        else {
            if (db.query == 0)
                error = QUERYZEROERR;
            else {
                /* Open the inverted text file for processing. */
                if ((invtHandle = open(db.database+".IVT","r")) == EOF)
                    error = IVTFILEERROR;
                else {
                    /* Open the database dictionary for processing */
                    if ((treeHandle = btopen(db.database+".DCT")) == EOF) {
                        close(invtHandle);
                        error = DCTFILEERROR;
                    }
                    else {
                        /* Create a temporary btree file to store the   */
                        /* document numbers and their document weights. */
                        /* Up to 200 candidate names are tried.         */
                        for(i = 0; i < 200; i = i + 1)
                            if ((weightedTreeHandle = btcreate(treeName = Path(program())+"TEMP"+str(i,4,0,'z')+".LST",0)) >= 0)
                                break;

                        /* Exit if the temporary btree could not be created. */
                        if (weightedTreeHandle < 0) {
                            close(invtHandle);
                            btclose(treeHandle);
                            error = TEMPFILEERROR;
                        }
                        else {
                            /* Create the duplicate-rejecting btree.  Start   */
                            /* after the index just used by the weight btree  */
                            /* so the two trees can never share a file name.  */
                            for(i = i + 1; i < 200; i = i + 1)
                                if ((noDupsAllowed = btcreate(noDupsAllowedTreeName = Path(program())+"TEMP"+str(i,4,0,'z')+".LST",0)) >= 0)
                                    break;

                            /* Exit if the temporary duplicate btree could not be created. */
                            if (noDupsAllowed < 0) {
                                close(invtHandle);
                                btclose(treeHandle);
                                btclose(weightedTreeHandle);
                                /* Don't leave the first temporary file behind. */
                                erase(treeName);
                                error = TEMPFILEERROR;
                            }
                        }
                    }
                }
            }
        }
    }

    if (error == FALSE) {
        /* Initialize the screen for status display */
        /* while we are processing the data base.   */
        /* NOTE(review): the non-ASCII characters in the strings below are   */
        /* presumably line/shade glyphs in the original DOS code page; they  */
        /* are kept exactly as found -- confirm the file's encoding.         */
        screen = save(8,19,17,61);
        blocks = 37;
        scroll(8,19,14,59,0,0,MenuColor_);
        box(8,19,16,61,"DS",MenuColor_);
        puts(09,26,"ÄÄ DISCUS Ranking Program ÄÄ",MenuColor_);
        puts(10,21,"Processing document",MenuColor_);
        puts(11,21,"Database processed",MenuColor_);
        puts(12,21,"Ranking documents",MenuColor_);
        puts(14,21,rep('°',blocks),MenuColor_);
        puts(10,57,"1",MenuColor_);
        puts(11,57,"0%",MenuColor_);
        puts(12,57,"0%",MenuColor_);

        /* Cycle through each document retrieving the field    */
        /* number and offset of each word that will be ranked. */
        for(physicalDocumentNumber = i = first(db,document,field,offset); i >= 0; i = nexthit(db,document,field,offset)) {
            /* Make sure the document hasn't been edited. If the document has */
            /* been edited, the field and offset values are probably invalid. */
            if (edited(db)) {
                error = REINDEXERROR;
                break;
            }

            /* Each time the physical document number, the record number, changes */
            /* store the total weight in the btree, reset the weight variable.    */
            if (physicalDocumentNumber <> i) {
                /* Records whose total weight is zero are never stored. */
                if (weight <> 0.0)
                    if (btinsert(weightedTreeHandle,str(physicalDocumentNumber,WIDTH),weight * 1000)) {
                        error = DISKFULLERROR;
                        break;
                    }
                weight = 0.0;
                physicalDocumentNumber = i;

                /* The document number changed, update the status display.     */
                /* i is reused as scratch here; the loop's increment           */
                /* expression reassigns it from nexthit() before the next test. */
                puts(10,48,str(i = docno(db),10,0,','),MenuColor_);
                percent = (i*1.0)/count(db);
                puts(11,48,str(percent*100.0,10),MenuColor_);
                puts(14,21,rep('²',blocks*percent),MenuColor_);

                /* Clear all entries from the noDupsAllowed btree so that */
                /* words are de-duplicated per record, not per query.     */
                for(j = btfirst(noDupsAllowed,word,0); j == 0; j = btfirst(noDupsAllowed,word,0))
                    btdelete(noDupsAllowed,word);
            }

            /* Calculate the weight for this word and add it to the document weight. */
            /* Only hits in fields of type 'P' contribute.                           */
            if (field > 0) {
                if (db.type[field] == 'P') {
                    word = upper(substr(db->field,offset,wordlen(db,addr(db->field,offset))));
                    /* If we can insert this word, it hasn't been counted yet. */
                    if (btinsert(noDupsAllowed,word,0) == 0)
                        weight = weight + findWeight(db, invtHandle, treeHandle, word);
                }
            }

            /* See if the user wants to exit the processing loop early. */
            if (keypress()) {
                if (getkey() == ESC)
                    if (Message("Cancel document ranking?  Y/N") == 'Y') {
                        error = USEREXITERROR;
                        break;
                    }
            }
        }

        /* Close the .IVT and .DCT files, we are finished with them. */
        close(invtHandle);
        btclose(treeHandle);

        /* Insert the last document weight when the loop finishes.        */
        /* After a normal loop exit i is negative, so it differs from the */
        /* last record number; if the loop never ran the two are equal    */
        /* and nothing is inserted.                                       */
        if ((physicalDocumentNumber <> i) and (error == FALSE) and (weight <> 0.0))
            if (btinsert(weightedTreeHandle,str(physicalDocumentNumber,WIDTH),weight * 1000))
                error = DISKFULLERROR;

        /* Now sort the list by rank order number. */
        if (error == FALSE) {
            if (btcount(weightedTreeHandle) <= 0)
                error = NOHITSERROR;
            else
                if (sort(db,"rankSort(db,weightedTreeHandle);",12,55,MenuColor_) < 0) {
                    Message("Sort cancelled");
                    error = USEREXITERROR;
                }
        }

        /* All done, close files, erase the temporary files. */
        btclose(weightedTreeHandle);
        erase(treeName);
        btclose(noDupsAllowed);
        erase(noDupsAllowedTreeName);

        /* Restore the user's screen. */
        restore(8,19,screen);
    }
    return(error);
} /* rank() */

/****************************************************************
*  Name:     rankSort                                           *
*  Synopsis: Called by CPL sort() for each document sorted.     *
*            Returns the ranking weight to sort() for use in    *
*            sorting the document.  Since the documents are     *
*            ordered in descending order, the dc() function     *
*            is used to create the correct sort order.          *
*  Params:   db                 - database handle.              *
*            weightedTreeHandle - btree of weights built by     *
*                                 rank().                       *
*  Returns:  dc() of the weight formatted 6 characters wide.    *
****************************************************************/
rankSort(int db, weightedTreeHandle)
{
int weight;

    /* Records with no stored weight sort with a weight of zero. */
    weight = 0;
    btfind(weightedTreeHandle,str(recno(db),WIDTH),weight);
    /* NOTE(review): the key is 6 characters wide while stored values */
    /* are weight * 1000; confirm that larger values cannot occur or  */
    /* that str() handles them acceptably.                            */
    return(dc(str(weight,6)));
} /* rankSort() */

/****************************************************************
*  Name:     findWeight                                         *
*  Synopsis: Calculates the weight of a given word using the    *
*            following algorithm:                               *
*  # documents in database - # times word occurs in db          *
*  ---------------------------------------------------          *
*             # of documents in database                        *
*  Params:   db         - database handle.                      *
*            invtHandle - handle to the opened .IVT file.       *
*            treeHandle - handle to the opened .DCT btree.      *
*            word       - word to look up.                      *
*  Returns:  The calculated weight, or 0.0 when the word is     *
*            not in the dictionary or the database reports no   *
*            documents.                                         *
****************************************************************/
findWeight(int db, invtHandle, treeHandle; char word[])
{
int   occurrences;
int   offset;
float weight, count;

    /* Default result for every path that cannot compute a weight. */
    weight = 0.0;

    if (btfind(treeHandle,word,offset) <> 0)
        Message("ERROR: Failed to locate "+word);
    else {
        /* The dictionary entry's value is used as a position in the */
        /* .IVT file, where a 4-byte count is read.                  */
        seek(invtHandle,offset,'B');
        read(invtHandle,occurrences,4);
        /* Perform the weight calculation here.  Make sure the document */
        /* count is not zero, avoid a runtime division by zero error.   */
        if ((count = db.documents) > 0.0)
            weight = (count - occurrences) / count;
    }
    return(weight);
} /* findWeight() */

/****************************************************************
*  Name:     Message                                            *
*  Synopsis: Displays error message and waits for key.          *
*  Params:   message - text shown centered in a boxed window.   *
*  Returns:  The key pressed.  When the key code's low byte is  *
*            non-zero the character is converted to upper case. *
****************************************************************/
Message(text message)
{
text screen;
int  key;

    cursoroff();
    screen = save(10,13,13,69);
    box(10,13,13,69,"DS", MenuColor_);
    puts(11,14,pad(message,'C',53),MenuColor_);
    /* Fold the key to upper case so callers can compare against 'Y'. */
    if ((key = getkey()) & 255)
        key = asc(upper(chr(key)));
    restore(10,13,screen);
    return(key);
} /* Message() */

/****************************************************************
*  Name:     Path                                               *
*  Synopsis: Returns the path up to the last \ and without      *
*            the file's name.                                   *
*  Params:   dosPath - full file name, including any path.      *
*  Returns:  The leading part of dosPath through the last \     *
*            found (or through the : when no \ follows it).     *
****************************************************************/
Path(text dosPath)
{
int i, j;

    /* Start at the drive separator if present, else at the first \. */
    if ((i = match(dosPath,":",1)) == 0)
        i = match(dosPath,"\",1);
    /* Advance to each later \ until no more are found. */
    while(j = match(dosPath,"\",i+1))
        i = j;
    return(substr(dosPath,1,i));
} /* Path() */

/****************************************************************
*  Global Variable Declarations and Initialization              *
*  Only ESC, EOF and FALSE are referenced in this file.  The    *
*  remaining names are presumably key codes as returned by      *
*  getkey() -- verify before relying on them.                   *
****************************************************************/
int   CTRLPGUP = 33792;
short LEFT     = 19200,
      RIGHT    = 19712,
      UP       = 18432,
      DOWN     = 20480,
      HOME     = 18176,
      END      = 20224,
      PGUP     = 18688,
      PGDN     = 20736,
      CTRLPGDN = 30208,
      F2       = 15360,
      F3       = 15616,
      F4       = 15872,
      F5       = 16128,
      F6       = 16384,
      F7       = 16640,
      F8       = 16896,
      F9       = 17152,
      F10      = 17408;
char  ESC      = 27,
      CTRLP    = 16,
      EOF      = -1,
      FALSE    = 0,
      TRUE     = 1,
      CR       = 13,
      LF       = 10;