Duplicate entry removal


1 Answer(s)


If you want to process Excel Spreadsheets you need a Java library like Apache POI or docx4j. Merge both sheets into one and apply this code(below) to remove duplicate entries.

You could also look into converting the spreadsheet to CSV format and then parse that to remove duplicates etc.

I am using Apache POI

######Code#####
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;


/**
* TODO To change the template for this generated type comment go to Window -
* Preferences - Java - Code Style - Code Templates
*/
public class RemoveDuplicates {

/**
*
*/
public RemoveDuplicates() {
super();
// TODO Auto-generated constructor stub
}

public static void main(String[] args) {
HSSFWorkbook workBook = null;
POIFSFileSystem fs = null;
HSSFSheet sheet = null;
try {
fs = new POIFSFileSystem(new FileInputStream(args[0]));
workBook = new HSSFWorkbook(fs);
sheet = workBook.getSheetAt(0);
int rows = sheet.getPhysicalNumberOfRows();
System.out.println(rows);
Set s=new HashSet();
String str="";
for (int i = 0; i < rows; i++) {
str="";
HSSFRow row = sheet.getRow((short) i);
int columns = row.getPhysicalNumberOfCells();
for (int j = 0; j < columns; ++j) {
HSSFCell cell0 = row.getCell((short) j);
int type=cell0.getCellType();
if(type==0){
double intValue= cell0.getNumericCellValue();
str=str+String.valueOf(intValue)+",";
}else if(type==1){
String stringValue=cell0.getStringCellValue();
str=str+stringValue+",";
}

}
str=str
.replace(str.charAt(str
.lastIndexOf(",")), ' ');
s.add(str.trim());
}
StringTokenizer st=null;
String result="";
Iterator iter=s.iterator();

//Create a new workbook for the output excel
HSSFWorkbook workBookOut = new HSSFWorkbook();

//Create a new Sheet in the output excel workbook
HSSFSheet sheetOut = workBookOut.createSheet("Remove Duplicates");
HSSFRow[] row = new HSSFRow[s.size()];
int rowCount=0;
int cellCount=0;
while(iter.hasNext()){
cellCount=0;
row[rowCount] = sheetOut.createRow(rowCount);
result=iter.next().toString();
System.out.println(result);
st=new StringTokenizer(result," ");
HSSFCell[] cell= new HSSFCell[st.countTokens()];
while(st.hasMoreTokens()){
cell[cellCount]=row[rowCount].createCell((short)cellCount);
cell[cellCount].setCellValue(st.nextToken());
++cellCount;
}
++rowCount;
}
FileOutputStream fileOut = new FileOutputStream(args[1]);
workBookOut.write(fileOut);
fileOut.close();
} catch (IOException ioe) {
ioe.printStackTrace();
}
}}