/*
Name

  cryptogrep

Description

   Searches a file for "hidden" strings composed of alphabetic
   characters, possibly separated by non-alpha characters, and
   prints the number of occurrences found.  This program assumes
   that the input file does not contain the ascii nul character
   ('\0').

Author

   Ray Ontko  February 15, 1999

Notes

This program has a few limitations:

   1) It assumes that the search string as entered does not 
      contain any non-alphabetic characters.

   2) It assumes that the null character ('\0') does not occur
      in the input file.

This program has 3 subprograms included below in this source file:

   cryptoscan, 
     which scans a block of text for a search string,

   cryptocount, 
     which counts the number of occurrences of a search
     string in a block, and

   last_n_alphas,
     which returns the starting point of the last n 
     alphabetic characters in a block.  This is used to
     help figure out which portion of a block should be 
     saved and concatenated with a succeeding block so that
     block-spanning target strings are not missed.

All other routines used are part of the C standard library.
In particular:

   strcpy, strlen, tolower, isalpha, 
   perror, fprintf, printf, putchar,
   fopen, fread, fclose, exit.

*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* 
#define debug
*/

#define PROG_NAME "cryptogrep"
#define BUF_SIZE 1000

int cryptoscan( char *buf , char *s , int begin , int *offset , int *length ) ;
void cryptocount( char *buf , char *search_string , int *found_count ) ;
int last_n_alphas( char *buf , int len , int *start ) ;

int main( int argc , char *argv[] )
/*
Program entry point.

Usage: cryptogrep <search-string> <input-file>

Reads the input file in blocks of at most BUF_SIZE bytes, counts the
occurrences of the search string in each block with cryptocount, and
prints the total.  Between reads, the tail of the current block that
holds its last n-1 letters (n = length of the search string) is moved
to the front of the buffer so that an occurrence spanning two blocks
is still seen as a whole.

Returns EXIT_SUCCESS after printing the count; exits with
EXIT_FAILURE on a usage error, if the file cannot be opened, or if
a read error occurs.
*/
{
char *search_string ;  /* our search string */
char *input_file_name ;  /* our input file name */
FILE *input_file ;  /* an input file */
char buf[BUF_SIZE+1] ;  /* a buffer (plus room for the terminator) */
size_t siz ;  /* number of bytes read into buffer */
size_t preset ;  /* bytes carried over at the front of the buffer */
size_t text_len ;  /* length of the text currently in the buffer */
size_t keep_from ;  /* offset where the carried-over tail begins */
size_t tail_len ;  /* length of the carried-over tail */
int tail_alphas ;  /* letters that must be carried over (n-1) */
int found_count ;
int last ;

/* process the command line: exactly two arguments are required */
if( argc != 3 )
  {
  fprintf( stderr , "Usage: %s <search-string> <input-file>\n" , 
    PROG_NAME ) ;
  exit( EXIT_FAILURE ) ;
  }
search_string = argv[1] ;
input_file_name = argv[2] ;
tail_alphas = (int) strlen( search_string ) - 1 ;

/* open the input file */
if( ( input_file = fopen( input_file_name , "r" ) ) == NULL )
  {
  fprintf( stderr , "%s: Unable to open input file \"%s\"\n" , 
    PROG_NAME , input_file_name ) ;
  perror( PROG_NAME ) ;
  exit( EXIT_FAILURE ) ;
  }

/* process the file */
found_count = 0 ;
preset = 0 ;
while( ( siz = fread( buf+preset , 1 , BUF_SIZE - preset , input_file ) ) > 0 )
  {
  buf[preset+siz] = '\0' ;
  cryptocount( buf , search_string , &found_count ) ;

  /* 
  decide which part of this block must be kept and re-scanned together
  with the next one.  Lengths are taken from strlen so that they agree
  with what the scanning routines (which stop at '\0') can see.
  */
  text_len = strlen( buf ) ;
  if( tail_alphas <= 0 )
    {
    /* a one-letter (or empty) search string can never span blocks */
    keep_from = text_len ;
    }
  else if( last_n_alphas( buf , tail_alphas , &last ) >= 0 )
    keep_from = (size_t) last ;
  else
    {
    /* 
    fewer than n-1 letters were reported: keep everything from the
    first letter on, so the start of a spanning match is not lost 
    */
    keep_from = 0 ;
    while( keep_from < text_len && ! isalpha( (unsigned char) buf[keep_from] ) )
      keep_from ++ ;
    }

  tail_len = text_len - keep_from ;
  if( tail_len >= BUF_SIZE )
    {
    /* 
    the tail would fill the whole buffer and leave no room to read;
    drop it (a match spanning this point may be missed) rather than
    stop reading the file 
    */
    tail_len = 0 ;
    }
  else
    {
    /* source and destination overlap, so strcpy must not be used */
    memmove( buf , buf+keep_from , tail_len ) ;
    }
  buf[tail_len] = '\0' ;
#ifdef debug
  printf( "excess \"%s\"\n" , buf ) ;
#endif
  preset = tail_len ;
  }

/* fread returns 0 both at end of file and on error; tell them apart */
if( ferror( input_file ) )
  {
  fprintf( stderr , "%s: Error reading input file \"%s\"\n" , 
    PROG_NAME , input_file_name ) ;
  fclose( input_file ) ;
  exit( EXIT_FAILURE ) ;
  }

/* close, print, exit */
fclose( input_file ) ;
printf( "%d occurrences of \"%s\" found.\n" , found_count , search_string ) ;
return( EXIT_SUCCESS ) ;
}

int cryptoscan( char *buf , char *s , int begin , int *offset , int *length )
{
/*
Scans "buf" for the characters of "s", starting at index "begin".
Letters are compared without regard to case, and any non-alpha
characters in "buf" (including those between matched letters) are
skipped.

On success, "*offset" receives the index in "buf" of the first
matched letter, "*length" receives the number of characters from
the first through the last matched letter (skipped characters
included), and the offset is returned.  If "s" is not found, -1 is
returned and "*offset" and "*length" are left untouched.

If "s" is the empty string, the scan succeeds immediately with
offset "begin" and length 0.

Both "buf" and "s" are assumed to be null-terminated strings.

"s" is assumed to contain only alpha characters.
*/

int i ;     /* index in buf */
int j ;     /* index in s */
int start ; /* temporary variable that holds the tentative start
               for a string in buf */

i = begin ; 
start = i ;
j = 0 ;

/* keep going until every character of the search string is matched */
while ( s[j] != '\0' )
  {
  /* 
  ignore non-alpha chars; the cast keeps the argument of isalpha
  within the range of unsigned char even where plain char is signed 
  */
  while ( buf[i] != '\0' && ! isalpha( (unsigned char) buf[i] ) )
    i ++ ;

  /* if we're at the end of the buffer, stop */
  if ( buf[i] == '\0' )
    break ;

#ifdef debug
  printf( "buf[%d] %c  s[%d] %c\n" , i , buf[i] , j , s[j] ) ;
#endif

  /* compare the current (alphabetic) characters in each string */
  if ( tolower( (unsigned char) buf[i] ) == tolower( (unsigned char) s[j] ) )
    {
    /* save our starting point if this is the first character */
    if ( j == 0 )
      start = i ;
    /* increment both counters and continue searching */
    i ++ ; 
    j ++ ;
    }
  else
    {
    /* 
    after a partial match, restart at the character following the
    tentative start.  If not even the first letter matched, only
    non-alpha characters lie between "start" and "i", so stepping
    past the current letter reaches the same position without
    re-walking them. 
    */
    if ( j == 0 )
      i = i + 1 ;
    else
      i = start + 1 ;
    j = 0 ;
    start = i ;
    }
  }

/* 
j only advances on a matched letter, so it sits on the terminator 
of "s" exactly when the whole search string was found 
*/
if ( s[j] == '\0' )
  {
  *offset = start ;
  *length = i - start ;
  return( start ) ;
  }

return( -1 ) ;
}

void cryptocount( char *buf , char *search_string , int *found_count )
/*
repeatedly calls cryptoscan to search a null-terminated buffer (buf)
for search_string, and increments *found_count once for each
occurrence.  After an occurrence is found, the search resumes just
past its last character, so occurrences that overlap an earlier one
are not counted.

Note that found_count is not initialized by this routine, only 
incremented; before calling this routine for the first time, you
should initialize found_count.

An empty search_string is never counted.
*/
{
int p ;  /* current search position */
int found ;  /* where found, relative to the beginning of the buffer */
int length ;  /* length of found string, including non-alpha characters */
#ifdef debug
int i ;  /* index used to echo the matched text */
#endif

/* 
cryptoscan reports an empty search string as a zero-length match
at the starting position, which would never advance the loop below 
*/
if( search_string[0] == '\0' )
  return ;

p = 0 ;
/* cryptoscan returns -1 when nothing is found; offset 0 is a valid match */
while( cryptoscan( buf , search_string , p , &found , &length ) >= 0 )
  {
#ifdef debug
  printf( "found at offset %d  length %d  \"" , found , length ) ;
  for( i = found ; i < found + length ; i ++ )
    putchar( buf[i] ) ;
  putchar( '\"' ) ;
  putchar( '\n' ) ;
#endif
  (*found_count) ++ ;
  /* continue searching right after the end of this occurrence */
  p = found + length ;
  }
}

int last_n_alphas( char *buf , int len , int *start )
/* 
this routine scans backwards from the end of the null-terminated
buffer "buf" looking for "len" alpha characters.  When they are
found, the index of the first (leftmost) of those "len" letters is
stored in "*start" and also returned, so that buf+*start is the
shortest tail of the buffer containing exactly "len" letters.

If "len" is zero or negative, no letters are needed: the index of
the terminating '\0' (an empty tail) is stored and returned.

If the buffer contains fewer than "len" letters, -1 is returned and
"*start" is left untouched.
*/
{
int i ;
int count ;
int siz ;

siz = (int) strlen( buf ) ;

/* nothing to look for: the empty tail at the end of the buffer */
if( len <= 0 )
  {
  *start = siz ;
  return( siz ) ;
  }

count = 0 ;
for( i = siz - 1 ; i >= 0 ; i -- )
  {
  /* cast keeps the argument of isalpha valid where plain char is signed */
  if( isalpha( (unsigned char) buf[i] ) )
    {
    count ++ ;
    /* 
    report the position of the len-th letter itself, not the
    character in front of it 
    */
    if( count == len )
      {
      *start = i ;
      return( i ) ;
      }
    }
  }

/* ran off the front of the buffer without finding enough letters */
return( -1 ) ;
}
