/******************************************************************************
**  The Rochester Connectionist Simulator - a neural network simulator.      **
**  COPYRIGHT (C) 1989  UNIVERSITY OF ROCHESTER.                             **
**                                                                           **
**  This program is free software; you can redistribute it and/or modify it  **
**  under the terms of the GNU General Public License as published by the    **
**  Free Software Foundation; either version 1, or (at your option) any      **
**  later version.                                                           ** 
**                                                                           **
**  This program is distributed in the hope that it will be useful, but      **
**  WITHOUT ANY WARRANTY; without even the implied warranty of               **
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                     **
**  See the GNU General Public License for more details.                     **
*******************************************************************************/

/* Context-free parsing network.  Written by Mark Fanty, April 1987.
 * Computer Science Department, University of Rochester   
 */

/* This file contains code to build the network.  The user calls
 * 'build', which reads a grammar description from a data file and
 * builds a network which can parse inputs up to the specified length
 * (see University of Rochester TR 164, "Parsing Context-Free Grammars
 * with Connectionist Networks").
 * 
 * Custom buttons are bound so that, in custom mode, a left click over
 * a unit primes it.  This is a convenient way to turn it on.  The
 * middle button is bound so that if clicked over a nonterminal node,
 * it will show an instantiation of that nonterminal.  It does this by
 * darkening other displayed units which are sufficient to satisfy a
 * production of the clicked-over nonterminal and by displaying these
 * units in detail in the info panel.  Repeated clicks over the same
 * unit cycle through all the instantiations.
 *
 * The grammar must be in Chomsky-normal form.  This restriction is
 * not necessary.  Longer productions result in a much larger network,
 * though. 
 */

#include <stdio.h>
#include <ctype.h>
#include <math.h>
#include "sim.h"		/* location of sim.h specified in makefile */
#include "parsing.h"

int Length;			/* maximum input length */
int ntct,tct;			/* nonterminal and terminal count */
static char *Start;		/* start symbol */
static char buffer[40];		/* scratch space for building unit names */

/* Functions defining the behavior of the network.  In uf.c */
int UFmatch();
int UFnt();
int UFt();
int UFsimple();
int UFendmarker();

/* Symbol tables: words[T] holds the terminal names and words[NT] the
 * nonterminal names; lastword[type] is the number of entries in use
 * in words[type].
 */
char *words[2][MAXWORDS];
static int lastword[2] = {0,0};	/* redundant with ntct and tct */

/* used to define the end of a "line"; should not be alphanumeric */
static int terminator = ';';

/* Forward declarations of the file-local helpers defined below.
 * Without these, the calls made from 'build' would implicitly declare
 * the helpers as external functions, which conflicts with their later
 * 'static' definitions.
 */
static int getword(),findword(),enterword();
static int addnt(),addt();
static int maket(),makent(),makeend(),find();
static int onelink(),twolink();

/* build(argc,argv)
 *
 * The single argument should be the name of a data file which
 * contains the grammar for the network.  This file has the following
 * format:
 *            <max. input length>
 *            <blank separated nonterminal list>;
 *            <start symbol>;
 *            <blank separated terminal list>;
 *            <nt> <nt> <nt>; (type NT production)
 *            <nt> <t>;       (type T production)
 *            etc.
 *
 * Nonterminal units are individually named.  The unit "NT.3.2" is the
 * nonterminal unit which recognizes strings of type NT of length 3,
 * starting in position 2.  For each nonterminal unit there are a
 * number of match units -- one for each instantiation of that nonterminal.
 * These are not named, nor do they appear in the display. 
 */

build(argc,argv)
     int argc;
     char *argv[];
{
  int status,row,col,len1,leftind,firstind,secondind,i;
  FILE *dataf;
  char word[30],*look;
  struct production *prod, *last = NULL,*new = NULL;

  
  /* open data file */
  if(argc != 2)
    {
      printf("usage: call build <data file>\n");
      return;
    }
  dataf = fopen(argv[1],"r");
  if(dataf == NULL)
    {
      printf("build: could not open %s\n",argv[1]);
      return;
    }

  fscanf(dataf,"%d",&Length);
  AllocateUnits(600);		/* more may be necessary */
  
  /* give states symbolic names */
  DeclareState("off",off);
  DeclareState("primed",primed);
  DeclareState("on",on);

  /****************/
  /* read grammar */
  /****************/

  /* read nonterminals */
  for(ntct = 0;ntct < MAXNT;ntct++)
    {
      status = getword(dataf,word);
      if(status == TERMINATOR || status == ENDFILE) break;
      addnt(word);
    }

  /* read start symbol */
  status = findword(dataf,&look);
  if(status != NT)
    {
      printf("expecting start symbol: %s not a nonterminal\n",look);
      return;
    }
  Start = look;
  if(getword(dataf,word) != TERMINATOR)
    {
      printf("expecting terminator after start symbol\n");
      return;
    }

  /* make endmarkers */
  for(i = 1;i <= Length+1;i++)
    {
      makeend(i);
    }

  /* read terminals */
  for(tct = 0;tct < MAXT;tct++)
    {
      status = getword(dataf,word);
      if(status == TERMINATOR || status == ENDFILE) break;
      addt(word);

      /* terminals inhibit endmarker; endmarker is on in positions
	 with no terminals; only the first can have any effect */
      for(i = 1;i <= Length;i++)
	{
	  (void) sprintf(buffer,"$.%1d",i);
	  firstind = find(buffer);
	  (void) sprintf(buffer,"%s.%1d",word,i);
	  secondind = maket(buffer);
	  if(secondind >= 0)
	    MakeLink(secondind,firstind,"inhibit",1000,0,NULL);
	  else
	    printf("addt: cannot find endmarker for column %d\n",i);
	}
    }

  /* read productions */
  for(i = 0;;i++)
    {
      /* read a production */

      last = new;
      new = (struct production *) malloc(sizeof(struct production));
      new->next = NULL;
      if(last == NULL)		/* first one */
	productions = new;
      
      status = findword(dataf,&(new->left));
      if(status == ENDFILE) break; /* no more productions */
      if(last != NULL) last->next = new;

      if(status != NT)
	{
	  printf("left side of production must be nonterm: %s\n",new->left);
	  return;
	}
      
      status = findword(dataf,&(new->first));
      if(status == TERMINATOR)
	{
	  printf("premature end of rule\n");
	  return;
	}
      if(status != NT && status != T)
	{
	  printf("unknown symbol: %s\n",new->first);
	  return;
	}
      new->type = status;
      
      status = findword(dataf,&(new->second));
      if(new->type == T)	/* type T production */
	{
	  if(status != TERMINATOR)
	    {
	      printf("expecting end of line after production\n");
	      return;
	    }
	}
      else if(new->type == NT)
	{
	  if(status == TERMINATOR)
	    {
	      printf("premature termination of type NT production\n");
	      return;
	    }
	  else if(status != NT)
	    {
	      printf("second symbol of type NT production not a nt\n");
	      return;
	    }
	  else if(status = findword(dataf,&(new->second)) != TERMINATOR)
	    {
	      printf("expecting terminator after production\n");
	      return;
	    }
	}
    }		/* end read production loop */
  printf("%d productions\n",i);
  if(i == 0) return;		/* boring grammar */

  /***********************/
  /* process productions */
  /***********************/

  /* echo them */
  for(prod = productions;prod != NULL;prod = prod->next)
    printf("type is %d: %s -> %s %s\n",prod->type,prod->left,prod->first,
	   (prod->type == NT ? prod->second : ""));

  /* all type T productions will be instantiated in row 1, because
     they must be of length one */
  
  for(prod = productions;prod != NULL;prod = prod->next)
    if(prod->type == T)		/* only in row 1; single terminal */
      for(col = 1;col <= Length;col++)
	{
	  (void) sprintf(buffer,"%s.1.%1d",prod->left,col);
	  leftind = makent(buffer);
	  
	  (void) sprintf(buffer,"%s.%1d",prod->first,col);
	  firstind = find(buffer);
	  
	  onelink(leftind,firstind);
	}

  /* type NT productions can occur anywhere from row 2 on */
  
  for(row = 2;row <= Length;row++)
    for(col = 1;col <= Length - (row-1);col++)
      for(prod = productions;prod != NULL;prod = prod->next)
	if(prod->type == NT)
	  /* len1 is length of first symbol's instantiation */
	  for(len1 = 1;len1 < row;len1++)
	    {
	      /* look for nt units corresponding to this production
		 which are of the correct length.  If both are found,
		 make the parent nt and call twolink to make a match
		 unit and connect everything */

	      (void) sprintf(buffer,"%s.%1d.%1d",prod->first,len1,col);
	      firstind = find(buffer);
	      if(firstind < 0) continue;
	      
	      (void) sprintf(buffer,"%s.%1d.%1d",prod->second,row-len1,col+len1);
	      secondind = find(buffer);
	      if(secondind < 0) continue;
	      
	      (void) sprintf(buffer,"%s.%1d.%1d",prod->left,row,col);
	      leftind = makent(buffer);	/* only first call does a makeunit */
	      
	      twolink(leftind,firstind,secondind);
	  }

  /* Make links from endmarkers to the appropriate start symbols.
     These links get the top-down pass started. */
  
  for(row = 1;row <= Length;row++)
    {
      (void) sprintf(buffer,"%s.%1d.1",Start,row);
      if((firstind = find(buffer)) > 0)
	{
	  (void) sprintf(buffer,"$.%1d",row+1);
	  if((secondind = find(buffer)) > 0)
	    MakeLink(secondind,firstind,"top",1000,0,NULL);
	  else
	    printf("missing end marker in column %d\n",row+1);
	}
      else
	printf("no start node (%s) in row %d\n",buffer,row);
    }
} /* end build */

/* The following code provides some simple symbolic capabilities.  The
 * string names of the terminals and nonterminals are their only
 * representation.  These strings are kept in the array 'words';
 * words[T] stores all the terminals; words[NT] stores the
 * nonterminals.  This allows strings read during the processing of
 * productions to be identified.  Nothing is very efficient, but it
 * does not need to be.
 */

/* getword(f,buf)
 *
 * Reads the next word from the stream 'f' into 'buf'.  Leading white
 * space is skipped.  A word is a run of alphanumeric characters; at
 * most 29 are stored, followed by a null, so 'buf' needs room for 30
 * characters.  The character which ended the word is pushed back on
 * the stream.
 *
 * Returns TERMINATOR if the terminator character is met before any
 * word, ENDFILE at end of file, and OK otherwise.  'buf' is written
 * only in the OK case.  Note that a character which is neither white
 * space, alphanumeric nor the terminator is pushed back unread, and
 * an empty word is returned with OK.
 */

static int getword(f,buf)
     FILE *f;
     char *buf;
{
  int ch,len;

  /* skip leading white space, but never past the terminator */
  ch = getc(f);
  while(isspace(ch) && ch != terminator)
    ch = getc(f);

  /* nothing to read on this "line" or in this file */
  if(ch == terminator) return TERMINATOR;
  if(ch == EOF) return ENDFILE;

  /* collect the word itself */
  len = 0;
  while(isalnum(ch) && len < 29)
    {
      buf[len++] = ch;
      ch = getc(f);
    }
  ungetc(ch,f);			/* leave the delimiter for the next call */
  buf[len] = '\0';
  return OK;
}

/* findword(f,wordpt)
 * 
 * Reads a word from the file descriptor and searches for it in the
 * terminal and nonterminal lists.  If it is found, *wordpt is set to
 * point to the stored string and the type is returned.  If it is not
 * found, *wordpt is set to point to a local buffer containing the
 * word and UNKNOWN is returned.  This storage is volatile -- it will
 * change upon the next call to findword.
 */

static int findword(f,wordpt)
     FILE *f;
     char **wordpt;
{
  int c,i,type;
  static char buf[30];

  /* eat space */
  for(c = getc(f);isspace(c) && c != terminator;c = getc(f));

  /* check for end */
  if(c == terminator) return TERMINATOR;
  if(c == EOF) return ENDFILE;

  /* process word */
  for(i = 0;
      isalnum(c) && i < 29;
      c = getc(f),i++) buf[i] = c;
  ungetc(c,f);
  buf[i] = '\0';
  *wordpt =  &buf[0];		/* returned if input not found */

  for(type = 0;type < 2;type++)
    for(i = 0;i < lastword[type];i++)
      {
	if(!strcmp(words[type][i],buf))
	  {
	    *wordpt = words[type][i];
	    return type;
	  }
      }
  return UNKNOWN;
}

/* enterword(word,type)
 *
 * Enters 'word' in the T or NT table, depending on the value of
 * 'type'.  The value of 'word' is copied into newly allocated storage,
 * which is kept for the table and never freed here.
 *
 * Returns 0 on success.  If the table is full or no storage can be
 * obtained, a message is printed, the table is left unchanged and -1
 * is returned.
 */
 
static int enterword(word,type)
     char *word;
     int type;
{
  char *copy;

  if(lastword[type] >= MAXWORDS)
    {
      printf("enterword: word table overflow\n");
      return -1;
    }

  copy = (char *) malloc(strlen(word)+1);
  if(copy == NULL)
    {
      printf("enterword: out of memory\n");
      return -1;
    }
  (void) strcpy(copy,word);

  /* only count the entry once the copy is safely in hand */
  words[type][lastword[type]++] = copy;
  return 0;
}

/* addnt(word), addt(word)
 *
 * Record 'word' as a nonterminal (addnt) or a terminal (addt) by
 * entering it in the corresponding symbol table.
 */

static addnt(word)
     char * word;
{
  /* nonterminals go in the NT table */
  (void) enterword(word,NT);
}

static addt(word)
     char * word;
{
  /* terminals go in the T table */
  (void) enterword(word,T);
}
      
/*
 * Network building
 */

/* maket(name)
 *
 * Makes a terminal unit: a "termtype" unit driven by UFt, with a
 * single site "top" (site function SFmax), and gives it the name
 * 'name'.  Returns the index of the new unit.
 */

static int maket(name)
     char *name;
{
  int unit = MakeUnit("termtype",UFt,0,0,0,0,0,0);

  SetFlag(unit,NO_LINK_FUNC_FLAG);
  AddSite(unit,"top",SFmax,0);
  NameUnit(name,SCALAR,unit,1,1);

  return unit;
}

/* makent(name)
 *
 * Returns the index of the nonterminal unit called 'name', making it
 * first if no unit of that name exists yet.  A new unit is a
 * "nontermtype" unit driven by UFnt, with sites "top" (SFmax) and
 * "bottom" (SFsum).
 */

static int makent(name)
     char *name;
{
  int unit;

  unit = NameToInd(name,1,1);
  if(unit < 0)			/* not there yet, so build it */
    {
      unit = MakeUnit("nontermtype",UFnt,0,0,0,0,0,0);
      SetFlag(unit,NO_LINK_FUNC_FLAG);
      AddSite(unit,"top",SFmax,0);
      AddSite(unit,"bottom",SFsum,0);
      NameUnit(name,SCALAR,unit,1,1);
    }
  return unit;
}

/* makeend(pos)
 *
 * Makes the endmarker unit for input position 'pos': an "endmarker"
 * unit driven by UFendmarker, with a single site "inhibit" (SFsum),
 * named "$.<pos>".  The name is built in the shared scratch 'buffer',
 * which is therefore overwritten.
 *
 * Returns the index of the new unit, as maket and makent do.
 */

static int makeend(pos)
     int pos;
{
  int ind;
  
  ind = MakeUnit("endmarker",UFendmarker,0,0,0,0,0,0);
  AddSite(ind,"inhibit",SFsum,0);
  (void) sprintf(buffer,"$.%1d",pos);
  NameUnit(buffer,SCALAR,ind,1,1);
  return ind;
}

/* find(name)
 *
 * Looks up the unit called 'name' and returns whatever NameToInd
 * reports for it.  Callers in this file treat a negative result as
 * "no such unit".
 */

static int find(name)
     char *name;
{
  int unit;

  unit = NameToInd(name,1,1);
  return unit;
}

/* onelink(left,first)
 * 
 * The unit 'first' instantiates a production of 'left'.  Make a match
 * unit and connect them.
 */

static int onelink(left,first)
     int left,first;
{
  int matchind;

  matchind = MakeUnit("match",UFmatch,0,0,0,0,0,0);
  AddSite(matchind,"bottom",SFmin,0);
  AddSite(matchind,"top",SFsum,0);
  MakeLink(first,matchind,"bottom",1000,0,NULL);
  MakeLink(matchind,left,"bottom",1000,0,NULL);
  MakeLink(left,matchind,"top",1000,0,NULL);
  MakeLink(matchind,first,"top",1000,0,NULL);
}

/* twolink(left,first,second)
 * 
 * The units 'first' and 'second' instantiate a production of 'left'.
 * Make a match unit and connect them.
 */

static int twolink(left,first,second)
     int left,first,second;
{
  int matchind;

  matchind = MakeUnit("match",UFmatch,0,0,0,0,0,0);
  AddSite(matchind,"bottom",SFmin,0);
  AddSite(matchind,"top",SFsum,0);

  MakeLink(second,matchind,"bottom",1000,0,NULL);
  MakeLink(first,matchind,"bottom",1000,0,NULL);
  MakeLink(matchind,left,"bottom",1000,0,NULL);

  MakeLink(left,matchind,"top",1000,0,NULL);
  MakeLink(matchind,first,"top",1000,0,NULL);
  MakeLink(matchind,second,"top",1000,0,NULL);
}
