# Shopee product matching: imports and F1 metric helper.
# FIX: the original used Unicode curly quotes (‘…‘) for every string literal,
# which is a Python syntax error; replaced with ASCII quotes throughout.
DATA_PATH = '../input/shopee-product-matching/'

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

# GPU-accelerated (RAPIDS) packages
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors


def getMetric(col):
    """Return a row-wise scorer computing F1 between `row[col]` and `row.target`.

    Both `row.target` and `row[col]` are arrays of posting_ids; F1 is
    2 * |intersection| / (|target| + |prediction|).
    """
    def f1score(row):
        n = len(np.intersect1d(row.target, row[col]))
        return 2 * n / (len(row.target) + len(row[col]))
    return f1score
# Read input files and decide whether to compute a local CV score.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
COMPUTE_CV = True

test = pd.read_csv(DATA_PATH + 'test.csv')
if len(test) > 3:
    # More than 3 test rows means the hidden test set is attached
    # (commit/submission run): skip the expensive CV computation.
    COMPUTE_CV = False
else:
    print('this submission notebook will compute CV score, but commit notebook will not')

if COMPUTE_CV:
    train = pd.read_csv(DATA_PATH + 'train.csv')
    train['image'] = DATA_PATH + 'train_images/' + train['image']
    # Ground truth: every posting_id sharing the same label_group.
    tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(tmp)
    train_gf = cudf.read_csv(DATA_PATH + 'train.csv')  # GPU copy for cuml TF-IDF
else:
    train = pd.read_csv(DATA_PATH + 'test.csv')
    train['image'] = DATA_PATH + 'test_images/' + train['image']
    train_gf = cudf.read_csv(DATA_PATH + 'test.csv')
# Baseline matcher: group postings that share an identical perceptual image hash.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
tmp = train.groupby('image_phash').posting_id.agg('unique').to_dict()
train['oof_hash'] = train.image_phash.map(tmp)

if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_hash'), axis=1)
    print('CV score for baseline =', train.f1.mean())
# CNN image-embedding model built on a pretrained resnet18.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
from PIL import Image
import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset


class ShopeeImageDataset(Dataset):
    """Loads product images from disk paths and applies a transform.

    Returns only the transformed image tensor (no label) — used purely
    for inference/feature extraction.
    """

    def __init__(self, img_path, transform):
        self.img_path = img_path    # array of file paths
        self.transform = transform

    def __getitem__(self, index):
        img = Image.open(self.img_path[index]).convert('RGB')
        return self.transform(img)

    def __len__(self):
        return len(self.img_path)


imagedataset = ShopeeImageDataset(
    train['image'].values,
    transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        # ImageNet mean/std, matching the pretrained resnet18 weights.
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]))

imageloader = torch.utils.data.DataLoader(
    imagedataset, batch_size=10, shuffle=False, num_workers=2)


class ShopeeImageEmbeddingNet(nn.Module):
    """resnet18 backbone with the final fc layer removed; emits pooled embeddings."""

    def __init__(self):
        super(ShopeeImageEmbeddingNet, self).__init__()
        model = models.resnet18(True)  # pretrained ImageNet weights
        model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        model = nn.Sequential(*list(model.children())[:-1])  # drop the fc head
        model.eval()
        self.model = model

    def forward(self, img):
        return self.model(img)


DEVICE = 'cuda'
imgmodel = ShopeeImageEmbeddingNet()
print(imgmodel)
imgmodel = imgmodel.to(DEVICE)

# Extract one embedding per image (inference only, no gradients).
imagefeat = []
with torch.no_grad():
    for data in tqdm_notebook(imageloader):
        data = data.to(DEVICE)
        feat = imgmodel(data)
        # Squeeze the trailing 1x1 spatial dims to (batch, channels).
        feat = feat.reshape(feat.shape[0], feat.shape[1])
        imagefeat.append(feat.data.cpu().numpy())
# L2-normalize the stacked embeddings so a plain dot product between two
# rows equals their cosine similarity (used by the matching step below).
from sklearn.preprocessing import normalize

imagefeat = normalize(np.vstack(imagefeat))
# Compute pairwise image similarity in chunks and keep ids with cosine > 0.95.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
preds = []
CHUNK = 1024 * 4

imagefeat = cupy.array(imagefeat)  # move to GPU for the big matmuls
print('Finding similar images...')

CTS = len(imagefeat) // CHUNK
print(CTS)
if len(imagefeat) % CHUNK != 0:
    CTS += 1  # one extra partial chunk for the remainder

for j in range(CTS):
    a = j * CHUNK
    b = min((j + 1) * CHUNK, len(imagefeat))
    print('chunk', a, 'to', b)
    # Rows were L2-normalized above, so this matmul yields cosine similarities
    # of every image against the images in [a, b).
    distances = cupy.matmul(imagefeat, imagefeat[a:b].T).T
    for k in range(b - a):
        IDX = cupy.where(distances[k, ] > 0.95)[0]
        o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)
# Attach the CNN-based matches and score them.
# FIX: removed leftover debug statements from the original notebook
# (`print(distances[1,])`, `print(distances[0,]>0.84)`, two bare
# `cupy.where(...)` expressions and `print(preds)`), one of which was an
# outright syntax error: `cupy.where(distances[k,]>0.95_`.
train['oof_cnn'] = preds

if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_cnn'), axis=1)
    print('CV score for baseline =', train.f1.mean())
# TF-IDF over product titles; keep ids with cosine similarity > 0.7.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings = model.fit_transform(train_gf.title).toarray()
print('text embeddings shape', text_embeddings.shape)

preds = []
CHUNK = 1024 * 4

print('Finding similar titles...')
CTS = len(train) // CHUNK
if len(train) % CHUNK != 0:
    CTS += 1  # extra partial chunk for the remainder

for j in range(CTS):
    a = j * CHUNK
    b = min((j + 1) * CHUNK, len(train))
    print('chunk', a, 'to', b)
    # Cosine-similarity distance: TF-IDF rows are L2-normalized by default
    # (norm='l2'), so the matmul gives cosine similarities directly.
    cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T
    for k in range(b - a):
        IDX = cupy.where(cts[k, ] > 0.7)[0]
        o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)

del model, text_embeddings  # free GPU memory before the merge step

train['oof_text'] = preds
if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_text'), axis=1)
    print('CV score for baseline =', train.f1.mean())
# Fuse the hash, CNN, and text match lists into the final predictions.
# FIX: curly-quote string literals (syntax error) replaced with ASCII quotes.
def combine_for_sub(row):
    """Union of the three per-row match lists, space-joined for submission."""
    x = np.concatenate([row.oof_text, row.oof_cnn, row.oof_hash])
    return ' '.join(np.unique(x))


def combine_for_cv(row):
    """Union of the three per-row match lists as an array, for CV scoring."""
    x = np.concatenate([row.oof_text, row.oof_cnn, row.oof_hash])
    return np.unique(x)


if COMPUTE_CV:
    tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(tmp)
    train['oof'] = train.apply(combine_for_cv, axis=1)
    train['f1'] = train.apply(getMetric('oof'), axis=1)
    print('CV Score =', train.f1.mean())

train['matches'] = train.apply(combine_for_sub, axis=1)

# Write the submission file.
train[['posting_id', 'matches']].to_csv('submission.csv', index=False)
# Combining image and text modalities (multimodal matching) reaches a score of 0.711.
# Source: https://www.cnblogs.com/momo521/p/14605649.html