import torch
import pandas as pd
from tqdm.notebook import tqdm

# Load the annotations. Passing names= gives the columns explicit labels
# ('id', 'text', 'category') so later code can refer to them by name.
df = pd.read_csv(
    'Data/smile-annotations-final.csv',
    names=['id', 'text', 'category'],
)

# Index rows by 'id' so later operations can look rows up by identifier.
df = df.set_index('id')

# Inspect the class distribution (the result is only displayed when this is
# the last expression of an interactive/notebook cell).
df.category.value_counts()

# Preprocessing: keep rows with exactly one category. Rows whose category
# contains '|' and rows marked "nocode" are dropped.
# regex=False matches the pipe literally, avoiding the invalid "\|" escape.
has_multiple_labels = df["category"].str.contains("|", regex=False)
is_nocode = df["category"] == "nocode"
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not raise SettingWithCopyWarning.
df = df[~has_multiple_labels & ~is_nocode].copy()

# Feature engineering: encode each category as an integer id, numbered in
# order of first appearance in the filtered data.
label_dict = {
    label: index for index, label in enumerate(df.category.unique())
}

# Every category is a key of label_dict, so map() yields an integer for each
# row without the implicit downcasting that Series.replace relies on.
df['label'] = df.category.map(label_dict)