/*-----------------------------------------------------------------------------

   QUASAR - q-gram Alignment based on Suffix ARrays

   Copyright (C) 1998 Stefan Burkhardt
   Author: Stefan Burkhardt <stburk@mpi-sb.mpg.de>
   This file is part of the QUASAR package.

   QUASAR is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   QUASAR is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the QUASAR package; see the file copying.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  or contact the author. 

-------------------------------------------------------------------------------

  suffix array construction tool
  
  $File$
  $Revision: 1.2 $
  $Date: Wed, 29 Mar 2000 11:07:45 +0200 $


  This program generates the suffix array for a set of sequences 
  in an .sset file.

  call syntax: 
  q_sa sset_filename q

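  where sset_filename is the name of the .sset file WITHOUT the
  extension and q is an integer between 0 and 12 that defines
  the length of the q-grams used for the bin_sort step. For q > 0,
  1<<(2*q) (i.e. 4^q) bins are created, the suffixes are distributed
  into them by an in-place bin sort, and the filled bins are then
  meant to be processed with qsort. NOTE: in this revision the per-bin
  qsort call in main() is commented out, so for q > 0 the suffixes are
  only grouped by bin. If one chooses q=0, the old version that simply
  does a qsort on the whole db is used.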
  
Sample running times for different db sizes and q (in seconds)
q =	0	1	2	3	4	6	8	10	12
  5 KB  0.04	0.03	0.04	0.03	0.05	0.33	7.78	BIG	BIG
  4 MB  49.3	49.7	46.9	44.1	43.9	34.2	30.1	32.7	BIG
 25 MB  878	933	1071	554	732	1040	1039	982	
 87 MB	8438	10481	14103	12845				14241
213 MB  64249
650 MB  > 5days  
-----------------------------------------------------------------------------*/

#include "q_sa.h"

#include <stdlib.h>
#include <string.h>

/* Length of the q-grams used for the bin sort.  main() sets it from the
   second command line argument and only accepts 0..12; it is read by
   s2int() and FastCompareSuffixes(). */
int		q;	/* choose something between 0 and 12 */
/* Sequence set whose suffixes are sorted.  main() assigns it from
   ReadSSet(); the qsort() comparators receive only the two elements to
   compare, so they reach the sequence data through this global. */
Q_SSet		*db;	/* THIS HAS TO BE GLOBAL FOR COMPARE */

/*
 * qsort() comparator for suffix array entries.
 *
 * s1 and s2 each point to an int holding a start offset into db->data.
 * The two suffixes beginning at those offsets are compared with strcmp()
 * and its result (<0, 0, >0) is returned unchanged.
 */
int CompareSuffixes(const void *s1, const void *s2)
{
  const int *first = (const int *)s1;
  const int *second = (const int *)s2;

  return strcmp(db->data + *first, db->data + *second);
}

/*
 * qsort() comparator for suffix array entries that skips the first q
 * characters of both suffixes when it can.
 *
 * s1 and s2 each point to an int holding a start offset into db->data.
 * Returns the strcmp() result (<0, 0, >0) of the compared strings.
 *
 * The q-character prefix is skipped only if BOTH suffixes start more than
 * q positions before db->offsets[db->size].  The earlier version tested
 * only the first operand, so db->data+q+*s2 could point at or beyond that
 * position when only the second suffix was close to the end.  In every
 * other case the complete suffixes are compared.
 *
 * NOTE(review): skipping is only valid when the two suffixes really share
 * their first q characters and neither contains a '\0' among them.
 * s2int() maps '\0' and every character other than C, G and T to 0, so
 * equal bin numbers do not guarantee this -- confirm before re-enabling
 * the per-bin qsort() in main().
 */
int FastCompareSuffixes(const void *s1, const void *s2)
{
  const int first = *(const int *)s1;
  const int second = *(const int *)s2;

  if(first < db->offsets[db->size] - q &&
     second < db->offsets[db->size] - q)
    return strcmp(db->data + q + first, db->data + q + second);

  return strcmp(db->data + first, db->data + second);
}

/*
 * Encodes the first q characters of sequence as a base-4 number
 * (two bits per character, first character most significant).
 *
 * 'C' counts as 1, 'G' as 2, 'T' as 3 and every other character as 0.
 * If a '\0' is met before q characters were read, it and all remaining
 * positions count as 0 and the value is returned right away (without the
 * DEBUG output below).
 *
 * Reads the global q.  With DEBUG defined, the sequence and its code are
 * printed and getchar() waits for input before returning.
 */
int s2int(char *sequence)
{
  int pos;
  int code = 0;

  for(pos = 0; pos < q; pos++) {
    char base = sequence[pos];
    int digit = 0;

    if(base == '\0') {
      /* pad the q-pos missing positions with zero digits */
      return code << (2 * (q - pos));
    }

    if(base == 'C')
      digit = 1;
    else if(base == 'G')
      digit = 2;
    else if(base == 'T')
      digit = 3;

    code = (code << 2) + digit;
  }
#ifdef DEBUG
  printf("sequence: %30.30s (%d)\n", sequence, code);
  getchar();
#endif
  return code;
}

/*
 * Sanity check for a suffix array: looks at every pair of neighbouring
 * entries and counts the pairs whose suffixes are in descending strcmp()
 * order.  The count is printed to stdout; nothing is returned.
 *
 *   sa  suffix array holding start offsets into db->data
 *   db  sequence set the offsets refer to (this parameter hides the
 *       global db inside the function)
 */
void CheckSa(int *sa, Q_SSet *db)
{
  int	pos = 0;
  int	misordered = 0;

  printf("checking suffix array\n");
  while(pos < db->offsets[db->size]-1) {
    misordered += (strcmp(db->data+sa[pos], db->data+sa[pos+1]) > 0) ? 1 : 0;
    pos++;
  }
  printf("Number of strcmp errors: %d\n", misordered);
}

/*
 * Dumps the suffix array to stdout, one line per entry: the index, the
 * stored offset and the first 20 characters of the suffix at that offset.
 * Before every entry whose index is 99 modulo 100, getchar() waits for
 * input so the listing can be read page by page.  Finally the address of
 * db->data is printed.
 *
 *   sa  suffix array holding start offsets into db->data
 *   db  sequence set the offsets refer to (this parameter hides the
 *       global db inside the function)
 */
void PrintSa(int *sa, Q_SSet *db)
{
  int	i;
  for(i=0; i<db->offsets[db->size]; i++) {
    if(i%100 == 99)
      getchar();
    printf("%8d: %8d -> %20.20s\n", i, sa[i], db->data+sa[i]);
  }
  /* %p with a void * argument: casting the pointer to int and printing it
     with %d loses the upper bits wherever pointers are wider than int. */
  printf("Address of db->data: %p\n", (void *)db->data);
}


/* MAIN PROGRAM  */
/*
 * Builds the suffix array for the sequence set named on the command line
 * and writes it as raw ints to "<database_name>.sa".
 *
 *   argv[1]  name of the database sset without the .sset extension
 *   argv[2]  bin_size q, a decimal integer in 0..12
 *
 * q == 0: all suffixes are sorted by one qsort() with CompareSuffixes().
 * q  > 0: the suffixes are distributed in place into 1<<(2*q) bins keyed
 *         by s2int().
 *
 * Returns 0 on success.  Errors detected in this function print a message
 * and call exit(-1).
 */
int main (int argc, char *argv[])
{
  int		i, j;
  int		tmp, tmp1;
  int		n_bins;
  long		q_arg;
  char		*q_end;
  int		*sa, *hits, *b_off;
  char		*buffer;
  size_t	n_suffixes, n_written;
  FILE		*outfile;

  printf("Suffix array construction $Revision: 1.2 $\n");
  printf("by Stefan Burkhardt\n");
  
  if(argc != 3) {
    printf("ERROR: call syntax of q_sa is:\n");
    printf("q_sa database_name bin_size\n");
    printf("where database_name is the file name of a database sset without the .sset extension and bin_size defines the level of bin_sort perforemd (0 to 12)\n");
    /* argv[1] and argv[2] are used below, so do not continue without them */
    exit (-1);
  }

  /* strtol() instead of atoi() so that non-numeric input is rejected
     rather than silently read as 0.  This is checked before the output
     file is opened so that a bad bin_size cannot truncate an existing
     .sa file. */
  q_arg = strtol(argv[2], &q_end, 10);
  if(q_end == argv[2] || *q_end != '\0' || q_arg < 0 || q_arg > 12) {
    printf("PLEASE CHOOSE bin_size BETWEEN 0 and 12!!!\n");
    exit (-1);
  }
  q = (int)q_arg;

  /* OPEN OUTPUTFILE */
  /* the name buffer is sized from the argument (sizeof(".sa") includes the
     terminating '\0'), so a long database name cannot overflow it */
  buffer = (char *)MyMalloc(strlen(argv[1]) + sizeof(".sa"), "output file name");
  sprintf(buffer,"%s.sa",argv[1]);
  /* NOTE(review): the file receives binary data; "wb" would be the
     portable mode if MyFopen passes the mode on to fopen() -- confirm. */
  outfile = MyFopen(buffer, "w");

  /* READ REQUIRED DATA FROM DISK */
  db = ReadSSet(argv[1]);
  n_suffixes = (size_t)db->offsets[db->size];

  /* INITIALIZE SUFFIX ARRAY */
  /* one int per suffix; the old sizeof(long) factor over-allocated wherever
     long is wider than int */
  sa = (int *)MyMalloc(sizeof *sa * n_suffixes, "suffix array");
  for(i=0; i<db->offsets[db->size]; i++) {
    sa[i] = i;
  }
  
  printf("Done Reading Data and Initializing\n");
  
  if(q == 0) {
    /* SORT THE POINTERS IN THE SUFFIX ARRAY */
    printf("Starting Simple Construction\n");
    qsort((void *)sa, n_suffixes, sizeof(int), CompareSuffixes);
  }
  else {
    printf("Starting construction based on bin_sort of %d-grams\n", q);
    n_bins = 1 << (2*q);

    /* hits[] has one extra slot that will hold the total number of
       suffixes; it is cleared too so the prefix sum below never reads an
       uninitialised value */
    hits = (int *)MyMalloc(sizeof *hits * ((size_t)n_bins + 1), "hit count array");
    for(i=0; i<=n_bins; i++) {
      hits[i]=0;
    }  
    b_off = (int *)MyMalloc(sizeof *b_off * (size_t)n_bins, "bin offset array");
    for(i=0; i<n_bins; i++) {
      b_off[i]=0;
    }  

    /* count the suffixes falling into each bin */
    for(i=0; i<db->offsets[db->size]; i++) {
      hits[s2int(db->data+sa[i])]++;
    }

    /* turn the counts into start positions: afterwards hits[i] is the
       index of the first entry of bin i and hits[n_bins] is the total */
    tmp = 0;
    for(i=0; i<=n_bins; i++) {
      tmp1 = hits[i];
      hits[i] = tmp;
      tmp += tmp1;
    }  
#ifdef DEBUG
    for(i=0; i<=n_bins; i++) {
      printf("%d: %d hits\n", i, hits[i]);
    }  
#endif
    
    /* in-place distribution: b_off[k] is the number of entries already
       moved to the front of bin k.  An entry found in bin i that belongs
       to another bin is swapped to the next free front slot of its own
       bin; the entry that comes back is examined at the same position. */
    for(i=0; i<n_bins; i++) {
      j = hits[i]+b_off[i];
      while (j<hits[i+1]) {
        tmp = s2int(db->data+sa[j]);
        if(tmp == i)
          j++;
        else {
          tmp1 = sa[hits[tmp] + b_off[tmp]];
          sa[hits[tmp] + b_off[tmp]] = sa[j];
          sa[j] = tmp1;
          b_off[tmp]++;
        }
      }
#ifndef DEBUG
      printf("Sorting %d entries for tuple %d\n", hits[i+1] - hits[i], i);
#endif
      /* NOTE(review): the per-bin sort is disabled, so for q > 0 the array
         is only grouped by bin and not sorted inside the bins.  Left as
         found; confirm whether it should be re-enabled.
      qsort((void *)&sa[hits[i]], hits[i+1]-hits[i], sizeof(int), FastCompareSuffixes);
      */
    }
    free(hits);
    free(b_off);
  }

    
#ifdef DEBUG
  CheckSa(sa, db);
#endif

#ifdef DEBUG
  PrintSa(sa, db);
#endif
  
  /* WRITE THE SUFFIX ARRAY TO THE OUTPUT FILE */
  /* the arguments are cast to int to match the %d conversions */
  printf("Writing Suffix Array with %d elements of size %d to %s\n", (int)n_suffixes, (int)sizeof(int), buffer);
  /* one fwrite() for the whole array produces the same bytes as writing
     the elements one by one; a short count means the file is incomplete */
  n_written = fwrite((void *)sa, sizeof(int), n_suffixes, outfile);
  if(n_written != n_suffixes) {
    printf("ERROR: could not write the complete suffix array to %s\n", buffer);
    exit (-1);
  }
  /* fclose() flushes buffered data, so its result is checked as well */
  if(fclose(outfile) != 0) {
    printf("ERROR: could not close %s\n", buffer);
    exit (-1);
  }
  
  /* CLEAN UP */
  free(sa);
  DeleteSSet(db);
  free(buffer);
  /* 0 is the conventional exit status for success (this used to be 1) */
  return 0;
}
