/*
 * Original coded: 2003/12/04 by abe
 *
 */

/*
 * LICENSE:
 * 
 *     Copyright (C) 2006 Hidenao Abe (COIN Project)
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License along
 *    with this program; if not, write to the Free Software Foundation, Inc.,
 *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 */
package coin.dataset_loader;

import java.util.StringTokenizer;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.FileNotFoundException;
import java.io.IOException;

import coin.Attribute;
import coin.Instance;

/**
 * @author Hidenao Abe  (hidenao@users.sourceforge.jp)
 * Loader for data sets stored in the C4.5 format (a .names file plus a .data or .test file).
 *
 */

public class C45DatasetLoader extends DatasetLoader {

	/** Token that marks a missing value in a C4.5 data file. */
	private static final String UNKNOWN_VALUE="?";

	/** Charset name handed to every reader opened by this loader. */
	private static final String CHARSET_NAME="JISAutoDetect";

	/** File extensions that are stripped from the name given to the constructor. */
	private static final String[] KNOWN_EXTENSIONS={".names", ".data", ".test"};

	/**
	 * Loads the attribute definitions and the instances of a C4.5 data set.
	 *
	 * @param dset_name either the bare file stem (e.g. <code>golf</code>) or the
	 *                  name of one of the files (<code>golf.names</code>,
	 *                  <code>golf.data</code>, <code>golf.test</code>).
	 *                  Attributes are always read from <code>stem.names</code>;
	 *                  instances are read from <code>stem.test</code> when the
	 *                  given name ends with <code>.test</code> and from
	 *                  <code>stem.data</code> otherwise.
	 */
	public C45DatasetLoader(String dset_name){

		super();

		this.commentChar='|';
		this.fieldDelimiter=",";

		// A bare stem is kept as it is; previously it produced an empty stem
		// and therefore the file names ".names" / ".data".
		String stem = stripKnownExtension(dset_name);

		// read names(=attribute information)
		readAttributes(stem+".names");

		// read dataset
		String data_file;
		if(dset_name.endsWith(".test")){
			data_file = stem+".test";
		}
		else{
			data_file = stem+".data";
		}
		readInstances(data_file);

		// set dataset's(relation) name to instances and attributeInfo
		this.instances.setName(stem);
		this.attributeInfo.setName(stem);

		// set attribute information to the data set
		this.instances.setAttributeInfo(this.attributeInfo);

		// the class attribute is appended last by readAttributes()
		this.attributeInfo.setClassIndex(this.attributeInfo.getNumAttributes() -1);

		// get max/min values on each attributes
		this.setMaxMinValues();
	}

	/**
	 * Removes a trailing ".names", ".data" or ".test" from the given name.
	 *
	 * @param name file name or file stem
	 * @return the name without the known extension, or the unchanged name
	 *         when it carries none of them
	 */
	private static String stripKnownExtension(String name){
		for(int i=0; i<KNOWN_EXTENSIONS.length; i++){
			String ext = KNOWN_EXTENSIONS[i];
			if(name.endsWith(ext)){
				return name.substring(0, name.length()-ext.length());
			}
		}
		return name;
	}

	/**
	 * Opens a buffered reader on the given file using {@link #CHARSET_NAME}.
	 * The underlying stream is closed again when the reader cannot be created.
	 */
	private static BufferedReader openC45Reader(String filename) throws IOException{
		FileInputStream in = new FileInputStream(filename);
		try{
			return new BufferedReader(new InputStreamReader(in, CHARSET_NAME));
		}
		catch(IOException e){
			// e.g. unsupported charset: do not leak the already opened stream
			try{
				in.close();
			}
			catch(IOException ignored){
				// the original failure is the one worth reporting
			}
			throw e;
		}
	}

	/** Closes the reader if it was opened; a failure to close is only reported. */
	private static void closeQuietly(BufferedReader reader){
		if(reader == null){
			return;
		}
		try{
			reader.close();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
	}

	/** Tells whether the character is a tab or any other kind of white space. */
	private static boolean isBlankChar(char c){
		// isSpaceChar() alone does not cover tabs, isWhitespace() alone does
		// not cover non-breaking spaces
		return Character.isSpaceChar(c) || Character.isWhitespace(c);
	}

	/**
	 * Returns the index of the first non-blank character of the line,
	 * or -1 when the line is empty or consists of white space only.
	 */
	private static int firstNonBlankIndex(String line){
		for(int i=0; i<line.length(); i++){
			if(!isBlankChar(line.charAt(i))){
				return i;
			}
		}
		return -1;
	}

	/**
	 * Reads the .names file: the first effective line lists the class values,
	 * every following line containing ':' defines one attribute. The class
	 * attribute (named "class") is appended after all other attributes.
	 * I/O errors are reported on stderr and reading stops at that point.
	 *
	 * @param filename path of the .names file
	 */
	private void readAttributes(String filename){

		String classLine=null;
		boolean classSeen=false;
		BufferedReader br=null;

		try{
			br = openC45Reader(filename);

			String line;
			while((line = br.readLine()) != null){
				int first = firstNonBlankIndex(line);
				// skip blank lines and lines that start with commentChar
				if(first < 0 || line.charAt(first) == commentChar){
					continue;
				}
				if(classSeen && line.indexOf(':') >= 0){
					attributeInfo.addAttribute(readAttribute(line));
				}
				else{
					// the first effective line, and (as before) any later
					// line without ':', is taken as the list of class values
					classLine = parseClassLine(line);
					classSeen = true;
				}
			}
		}
		catch(FileNotFoundException fne){
			fne.printStackTrace();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
		finally{
			closeQuietly(br);
		}

		// append class information
		Attribute classAtt = new Attribute();
		classAtt.setName("class");
		classAtt.setNominalValues(classLine,fieldDelimiter);
		classAtt.setType(Attribute.NOMINAL);

		attributeInfo.addAttribute(classAtt);

		// initialize attribute id
		attributeInfo.initAttributeID();
	}

	/**
	 * Extracts the delimiter separated class values from a class line:
	 * everything from commentChar on is dropped, as are white space and dots.
	 */
	private String parseClassLine(String line){
		StringBuffer buf = new StringBuffer();
		for(int j=0; j<line.length(); j++){
			char c = line.charAt(j);
			if(c == commentChar){
				break;
			}
			if(c != '.' && !isBlankChar(c)){
				buf.append(c);
			}
		}
		return buf.toString();
	}

	/**
	 * Parses one attribute definition of the form <code>name: values.</code>.
	 * White space is removed from the name and the values, dots are removed
	 * from the values, and a trailing comment (from commentChar on) is ignored.
	 * The value "continuous" yields a NUMERIC attribute, "ignore"/"Ignore" an
	 * IGNORE attribute, anything else a NOMINAL attribute with those values.
	 *
	 * NOTE(review): dots are removed everywhere in the value part, so a
	 * nominal value such as "1.5" is stored as "15" - confirm whether only
	 * the terminating dot should be dropped.
	 *
	 * @param line a line of the .names file that contains ':'
	 * @return the attribute described by the line
	 */
	private Attribute readAttribute(String line){

		Attribute result = new Attribute();
		StringBuffer name_buf = new StringBuffer();
		StringBuffer value_buf = new StringBuffer();
		boolean isValue=false;

		for(int i=0; i<line.length(); i++){
			char c = line.charAt(i);
			if(c == commentChar){
				// rest of the line is a comment
				break;
			}
			if(isBlankChar(c)){
				continue;
			}
			if(!isValue){
				if(c == ':'){
					result.setName(name_buf.toString());
					isValue = true;
				}
				else{
					name_buf.append(c);
				}
			}
			else if(c != '.'){
				value_buf.append(c);
			}
		}

		String value = value_buf.toString();
		if(value.equals("continuous")){
			result.setType(Attribute.NUMERIC);
		}
		else if(value.equals("ignore") || value.equals("Ignore")){
			result.setType(Attribute.IGNORE);
		}
		else{
			result.setType(Attribute.NOMINAL);
			result.setNominalValues(value, fieldDelimiter);
		}

		return result;
	}

	/**
	 * Reads the .data/.test file and adds one instance per effective line.
	 * Blank lines and lines starting with commentChar are skipped.
	 * I/O errors are reported on stderr and reading stops at that point.
	 *
	 * @param filename path of the data file
	 */
	private void readInstances(String filename){

		BufferedReader br=null;

		try{
			br = openC45Reader(filename);

			String line;
			while((line = br.readLine()) != null){
				int first = firstNonBlankIndex(line);
				if(first < 0 || line.charAt(first) == commentChar){
					continue;
				}
				instances.addInstance(readInstance(line));
			}
		}
		catch(FileNotFoundException fne){
			fne.printStackTrace();
		}
		catch(IOException ioe){
			ioe.printStackTrace();
		}
		finally{
			closeQuietly(br);
		}

	}

	/**
	 * Converts one line of the data file into an instance. Each field is
	 * trimmed; "?" is stored as Instance.UNKNOWN_VALUE. For nominal
	 * attributes the occurrence counter of the value is incremented.
	 *
	 * @param line a non-blank, non-comment line of the data file
	 * @return the instance holding the values of the line
	 */
	private Instance readInstance(String line){

		Instance instance = new Instance();
		StringTokenizer strt = new StringTokenizer(line,fieldDelimiter);
		int numAttributes = attributeInfo.getNumAttributes();

		int index=0;
		while(strt.hasMoreTokens()){
			// "a, b" must yield "b": the values in the .names file carry no blanks
			String token = strt.nextToken().trim();
			if(token.equals(C45DatasetLoader.UNKNOWN_VALUE)){
				instance.addValue(Instance.UNKNOWN_VALUE);
			}
			else{
				instance.addValue(token);
				// count number of each nominal value; fields beyond the
				// declared attributes have nothing to be counted against
				if(index < numAttributes){
					Attribute att = attributeInfo.getAttribute(index);
					if(att.getType() == Attribute.NOMINAL){
						int t = att.getNumOfNominalValue(token);
						att.setNumOfNominalValue(token,t+1);
					}
				}
			}
			index++;
		}

		return instance;
	}

	/**
	 * Stores the largest and the smallest known value of every NUMERIC
	 * attribute on the attribute (as float strings). Unknown values are
	 * skipped. When an attribute has no known value at all the maximum stays
	 * at negative infinity and the minimum at positive infinity.
	 */
	private void setMaxMinValues(){

		for(int i=0; i<attributeInfo.getNumAttributes(); i++){
			Attribute att = attributeInfo.getAttribute(i);

			if(att.getType() != Attribute.NUMERIC){
				continue;
			}

			// Double.MIN_VALUE is the smallest POSITIVE double and therefore
			// no valid start value for a maximum
			double max=Double.NEGATIVE_INFINITY;
			double min=Double.POSITIVE_INFINITY;
			for(int j=0; j<instances.size(); j++){
				String value = instances.getInstance(j).getValueAt(i);
				if(value.equals(Instance.UNKNOWN_VALUE)){
					continue;
				}
				double t_value = Double.parseDouble(value);
				// both tests are needed for every value: the first value
				// read is the maximum and the minimum at the same time
				if(max < t_value){
					max = t_value;
				}
				if(min > t_value){
					min = t_value;
				}
			}
			att.setMaxValue(Float.toString((float)max));
			att.setMinValue(Float.toString((float)min));
		}

	}

	/**
	 * Returns the textual form of the attribute information of the data set.
	 */
	public String toString(){
		return attributeInfo.toString();
	}

	/**
	 * Loads the data set named by the first argument and prints its
	 * attribute information.
	 *
	 * @param args args[0]: file stem or name of the .names/.data/.test file
	 */
	public static void main(String[] args){

		if(args.length < 1){
			System.err.println("Usage: C45DatasetLoader <stem | stem.names | stem.data | stem.test>");
			return;
		}

		try{
			C45DatasetLoader dset = new C45DatasetLoader(args[0]);
			System.out.println(dset.toString());
		}
		catch(Exception e){
			e.printStackTrace();
		}

	}

}
