My seniors asked me to help remove redundant, near-duplicate data points. Here is a Python program that solves this problem.
Release 1.1
- Greatly improved computational efficiency
Release 1.2
- Added the ability to export similar data separately as a table
# RDP v1.2 -- remove near-duplicate data points from an Excel sheet.
#
# Two rows count as duplicates when every column listed in COMPARE_COLUMNS
# deviates by at most TOLERANCE relative to the earlier row. The later row of
# each such pair is dropped from the main output table.

# 0-based column positions that are compared between two rows.
COMPARE_COLUMNS = (3, 4, 9, 10, 11, 12, 13, 16, 44)

# Largest relative deviation, |a - b| / |a|, still counted as "the same".
TOLERANCE = 0.06


def rows_are_similar(reference, candidate, columns=COMPARE_COLUMNS,
                     tolerance=TOLERANCE):
    """Return True when *candidate* matches *reference* in every column.

    Args:
        reference: Indexable row the deviation is measured against.
        candidate: Indexable row being tested.
        columns: Positions of the values to compare.
        tolerance: Maximum allowed relative deviation per column.

    Returns:
        True if every compared column is within *tolerance*, else False.
        A NaN on either side never matches.
    """
    for col in columns:
        ref_value = reference[col]
        other_value = candidate[col]
        if ref_value == 0:
            # The relative deviation is undefined for a zero reference, so
            # only an exact zero on the other side counts as a match. This
            # also avoids dividing by zero.
            if other_value != 0:
                return False
            continue
        # Written as "not (x <= tol)" so that NaN is rejected as well.
        if not abs((ref_value - other_value) / ref_value) <= tolerance:
            return False
    return True


def iter_similar_pairs(rows, columns=COMPARE_COLUMNS, tolerance=TOLERANCE):
    """Yield ``(reference_pos, duplicate_pos)`` for every similar row pair.

    Every row is compared with every later row, so the first element of a
    pair is always smaller than the second. Pairs come out ordered by
    reference position, then by duplicate position.
    """
    row_count = len(rows)
    for ref_pos in range(row_count - 1):
        reference = rows[ref_pos]
        for other_pos in range(ref_pos + 1, row_count):
            if rows_are_similar(reference, rows[other_pos], columns,
                                tolerance):
                yield ref_pos, other_pos


def main():
    """Ask for the file names, filter the sheet and write the three outputs.

    Outputs:
        * the de-duplicated table (CSV),
        * the table of similar rows, reference and duplicate per pair (CSV),
        * a text record mapping each removed row's first-column value to the
          first-column value of the row it was matched against.

    Any failure is reported through the error banner together with its
    cause; Ctrl-C is no longer swallowed.
    """
    try:
        # Imported here so that a missing pandas installation is reported
        # through the same error banner as every other failure.
        import pandas as pd

        print('****************** RDP v1.2 **********************')
        print(' s20180318@xs.ustb.edu.cn')
        print(' Unvi. of science & technology Beijing')
        print(' Copyright © 2019, lixin.fun. All rights reserved.')
        print('**************************************************')
        print('Please input file (.xls or .xlsx) that you want to Remove duplicate points:')
        fileName = input()
        print('Please set the output file name, and set .csv as a suffix (e.g. out.csv):')
        outName = input()
        print('Please set the deleted output file name, and set .csv as a suffix (e.g. out.csv):')
        outName2 = input()
        print('Please set the filter record file name, and set .txt as a suffix (e.g. out.txt):')
        outName3 = input()

        df = pd.read_excel(fileName)
        # Extract every row once up front instead of once per compared pair.
        # The header is not included and positions start from zero.
        rows = [df.iloc[pos].values for pos in range(df.shape[0])]

        matched_with = {}       # removed row id -> id of the row it matched
        drop_positions = []     # rows to delete from the main table
        similar_positions = []  # reference + duplicate of every pair
        pair_count = 0
        for ref_pos, dup_pos in iter_similar_pairs(rows):
            pair_count += 1
            print(pair_count)
            ref_id = rows[ref_pos][0]
            dup_id = rows[dup_pos][0]
            matched_with[dup_id] = ref_id
            print(str(ref_id) + "," + str(dup_id) + "\n")
            drop_positions.append(dup_pos)
            similar_positions.extend((ref_pos, dup_pos))

        df.iloc[similar_positions].to_csv(outName2, index=False)

        # A row can match several references; drop and count it only once.
        unique_drops = list(dict.fromkeys(drop_positions))
        # drop() works on labels; read_excel's default index makes the
        # labels equal to the row positions used above.
        df.drop(unique_drops, inplace=True)
        df.to_csv(outName, index=False)
        print('There are ' + str(len(unique_drops)) + ' data deleted.')
        print(matched_with)
        with open(outName3, 'w') as record_file:
            record_file.write(str(matched_with))
        print('************* Mission Completed!******************')
    except Exception as err:
        print('*****************Error Message********************')
        print('Plase check your input and run this program again!')
        # Show the cause so the user knows which input to check.
        print('Details: ' + type(err).__name__ + ': ' + str(err))
        print('**************************************************')


if __name__ == "__main__":
    main()
How to use
Linux (root) : python3 RDPv1.py
Comments