对于直方图向量,我们引入 TF-IDF 权值
我们需要给每一个词设置一个权重,而且这个权重必须满足以下两个条件:(1)一个词预测主题的能力越强,权重越大,反之权重越小;(2)停止词(如“的”“是”等)的权重为零。
TF-IDF 是通过增加权重的方法,凸显出重要的关键信息。同样的,在图像检索中,为了更精确地度量相似性,我们也在原来直方图向量的基础上,为向量的每一项增加权重。按照上面信息检索的方法,我们需要给字典里的每个向量(visual word)设置权重。

为创建视觉单词词汇,首先需要提取特征描述子,使用SIFT特征描述子,得到每幅图像提取的描述子,并将每幅图像的描述子保存在一个文件中:
# -*- coding: utf-8 -*-
import pickle
from PCV.imagesearch import vocabulary
from PCV.tools.imtools import get_imlist
from PCV.localdescriptors import sift
# Step 1 of the pipeline: extract SIFT descriptors for every image and train
# the visual-word vocabulary, then pickle it for the later steps.

# Get the list of images.
imlist = get_imlist('E:/test_pic/BOF/')
nbr_images = len(imlist)

# One feature file per image: the last three characters of the image name
# (assumed to be a 3-letter extension such as "jpg") are replaced by "sift".
featlist = [imname[:-3] + 'sift' for imname in imlist]

# Extract the SIFT features of every image and write them to its feature file.
for imname, featname in zip(imlist, featlist):
    sift.process_image(imname, featname)

# Train the vocabulary on the feature files.
# NOTE(review): 1000 and 10 are presumably the number of visual words and the
# descriptor subsampling step -- confirm against PCV's Vocabulary.train.
voc = vocabulary.Vocabulary('ukbenchtest')
voc.train(featlist, 1000, 10)

# Save the vocabulary.
with open('E:/test_pic/BOF/vocabulary.pkl', 'wb') as f:
    pickle.dump(voc, f)
print('vocabulary is:', voc.name, voc.nbr_words)

同时生成数据文件vocabulary.pkl

# -*- coding: utf-8 -*-
import pickle
from PCV.imagesearch import imagesearch
from PCV.localdescriptors import sift
from sqlite3 import dbapi2 as sqlite
from PCV.tools.imtools import get_imlist
# Step 2 of the pipeline: project every image's SIFT descriptors on the
# trained vocabulary and store the result in the SQLite index testImaAdd.db.

# Get the list of images.
imlist = get_imlist('E:/test_pic/BOF/')
nbr_images = len(imlist)

# Feature files written by the vocabulary step: the last three characters of
# the image name (assumed to be the extension) are replaced by "sift".
featlist = [imname[:-3] + 'sift' for imname in imlist]

# Load the vocabulary.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('E:/test_pic/BOF/vocabulary.pkl', 'rb') as f:
    voc = pickle.load(f)

# Create the index database and its tables.
indx = imagesearch.Indexer('testImaAdd.db', voc)
indx.create_tables()

# Go through (at most the first 500) images, project their features on the
# vocabulary and insert them into the index.
for imname, featname in zip(imlist[:500], featlist[:500]):
    locs, descr = sift.read_features_from_file(featname)
    indx.add_to_index(imname, descr)

# Commit to database.
indx.db_commit()

# Sanity check: number of indexed images and the first row of imlist.
con = sqlite.connect('testImaAdd.db')
try:
    print(con.execute('select count (filename) from imlist').fetchone())
    print(con.execute('select * from imlist').fetchone())
finally:
    # The original never closed this connection.
    con.close()
此处会报错:

如果你没有安装所有依赖包(我安装的是 Anaconda,所以可以直接运行),就需要打开 PCV 的 imagesearch 模块(imagesearch.py)修改代码:
将其内部所有代码替换为:
from numpy import *
import pickle
import sqlite3
from functools import cmp_to_key
import operator
class Indexer(object):
    """Write images into an SQLite-backed visual-word index.

    Three tables are used (see create_tables): ``imlist`` holds file names,
    ``imwords`` links an image id to the ids of the visual words it contains,
    and ``imhistograms`` holds the pickled word histogram of each image.
    """

    def __init__(self, db, voc):
        """ Initialize with the name of the database
            and a vocabulary object. """
        self.con = sqlite3.connect(db)
        self.voc = voc

    def __del__(self):
        # self.con does not exist if sqlite3.connect() raised in __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def db_commit(self):
        """ Commit pending changes to the database. """
        self.con.commit()

    def get_id(self, imname):
        """ Get an entry id and add if not present. """
        # Bound parameters (not string formatting) so that file names
        # containing quotes cannot break or alter the statement.
        cur = self.con.execute(
            "select rowid from imlist where filename=?", (imname,))
        res = cur.fetchone()
        if res is None:
            cur = self.con.execute(
                "insert into imlist(filename) values (?)", (imname,))
            return cur.lastrowid
        return res[0]

    def is_indexed(self, imname):
        """ Returns True if imname has been indexed. """
        im = self.con.execute(
            "select rowid from imlist where filename=?", (imname,)).fetchone()
        return im is not None

    def add_to_index(self, imname, descr):
        """ Take an image with feature descriptors,
            project on vocabulary and add to database.

            Does nothing if imname is already indexed. The caller is
            responsible for calling db_commit() afterwards. """
        if self.is_indexed(imname):
            return
        print('indexing', imname)

        # get the imid
        imid = self.get_id(imname)

        # get the words (a histogram with one bin per vocabulary word)
        imwords = self.voc.project(descr)

        # Link each word that occurs in the image to the image. The word id
        # is the bin index, which is what Searcher.candidates_from_histogram
        # looks up; the original stored the bin *count* here instead.
        # int() is needed because sqlite3 stores NumPy integers as blobs.
        rows = [(imid, int(word), self.voc.name)
                for word in imwords.nonzero()[0]]
        self.con.executemany(
            "insert into imwords(imid,wordid,vocname) values (?,?,?)", rows)

        # store word histogram for image
        # use pickle to encode NumPy arrays as strings
        self.con.execute(
            "insert into imhistograms(imid,histogram,vocname) values (?,?,?)",
            (imid, pickle.dumps(imwords), self.voc.name))

    def create_tables(self):
        """ Create the database tables. """
        self.con.execute('create table imlist(filename)')
        self.con.execute('create table imwords(imid,wordid,vocname)')
        self.con.execute('create table imhistograms(imid,histogram,vocname)')
        self.con.execute('create index im_idx on imlist(filename)')
        self.con.execute('create index wordid_idx on imwords(wordid)')
        self.con.execute('create index imid_idx on imwords(imid)')
        self.con.execute('create index imidhist_idx on imhistograms(imid)')
        self.db_commit()
class Searcher(object):
    """Query an SQLite-backed visual-word index.

    Reads the ``imlist``, ``imwords`` and ``imhistograms`` tables: file names,
    (image id, word id) links, and one pickled word histogram per image id.
    """

    def __init__(self, db, voc):
        """ Initialize with the name of the database. """
        self.con = sqlite3.connect(db)
        self.voc = voc

    def __del__(self):
        # self.con does not exist if sqlite3.connect() raised in __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def _histogram_for_id(self, imid):
        """ Return the word histogram stored for image id imid. """
        # Look the histogram up by its imid column; the original used the
        # rowid of imhistograms, which only matches by insertion order.
        s = self.con.execute(
            "select histogram from imhistograms where imid=?",
            (imid,)).fetchone()
        # use pickle to decode NumPy arrays from string
        # NOTE: pickle.loads can execute arbitrary code; only open index
        # databases that you created yourself.
        return pickle.loads(s[0])

    def get_imhistogram(self, imname):
        """ Return the word histogram for an image.

            Raises TypeError if imname is not in the index (the lookup
            returns no row). """
        im_id = self.con.execute(
            "select rowid from imlist where filename=?", (imname,)).fetchone()
        return self._histogram_for_id(im_id[0])

    def candidates_from_word(self, imword):
        """ Get list of images containing imword. """
        # int() because sqlite3 would bind a NumPy integer as a blob.
        im_ids = self.con.execute(
            "select distinct imid from imwords where wordid=?",
            (int(imword),)).fetchall()
        return [i[0] for i in im_ids]

    def candidates_from_histogram(self, imwords):
        """ Get list of images with similar words. """
        # get the word ids
        words = imwords.nonzero()[0]

        # count, for every candidate image, how many query words it shares
        counts = {}
        for word in words:
            for imid in self.candidates_from_word(word):
                counts[imid] = counts.get(imid, 0) + 1

        # Sort on number of shared words, best matches first. The original
        # comparator returned a bool (never negative), which is not a valid
        # ordering for cmp_to_key.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return [imid for imid, _ in ranked]

    def query(self, imname):
        """ Find a list of matching images for imname.

            Returns (distance, image id) tuples sorted by ascending
            idf-weighted distance to the histogram of imname. """
        h = self.get_imhistogram(imname)
        candidates = self.candidates_from_histogram(h)

        matchscores = []
        for imid in candidates:
            cand_h = self._histogram_for_id(imid)
            cand_dist = sqrt(sum(self.voc.idf * (h - cand_h) ** 2))
            matchscores.append((cand_dist, imid))

        # return a sorted list of distances and database ids
        matchscores.sort()
        return matchscores

    def get_filename(self, imid):
        """ Return the filename for an image id. """
        s = self.con.execute(
            "select filename from imlist where rowid=?",
            (int(imid),)).fetchone()
        return s[0]
def tf_idf_dist(voc, v1, v2):
    """ Return the idf-weighted Euclidean distance between two histograms.

        Each histogram is first normalized to sum to one; the squared
        differences are then weighted by voc.idf. The inputs are not
        modified (the original normalized them in place, which also failed
        for integer arrays). A histogram whose sum is zero is left as it is
        instead of being divided by zero. """
    v1 = asarray(v1, dtype=float)
    v2 = asarray(v2, dtype=float)

    total1 = v1.sum()
    total2 = v2.sum()
    # "v / total" builds a new array, so the caller's data stays untouched.
    if total1 != 0:
        v1 = v1 / total1
    if total2 != 0:
        v2 = v2 / total2

    return sqrt((voc.idf * (v1 - v2) ** 2).sum())
def compute_ukbench_score(src, imlist):
    """ Returns the average number of correct
        images on the top four results of queries.

        src must provide query(imname) returning (distance, image id)
        tuples, best match first. Image ids are taken to be 1-based positions
        in imlist, and images are grouped in consecutive runs of four: a
        result is correct when it falls in the same run as the query image.
        Returns 0.0 for an empty imlist. """
    nbr_images = len(imlist)
    if nbr_images == 0:
        return 0.0

    # -1 marks "no result": -1 // 4 == -1 never equals a group index, so a
    # query returning fewer than four results simply scores lower instead of
    # raising a shape error (or counting a spurious hit for group 0).
    pos = zeros((nbr_images, 4)) - 1

    # get first four results for each image
    for i, imname in enumerate(imlist):
        top = [w[1] - 1 for w in src.query(imname)[:4]]
        if top:
            pos[i, :len(top)] = top

    # compute score and return average
    score = array([(pos[i] // 4) == (i // 4) for i in range(nbr_images)]) * 1.0
    return score.sum() / nbr_images
# import PIL and pylab for plotting
from PIL import Image
from pylab import *
def plot_results(src, res):
    """ Show images in result list 'res'.

        res is a sequence of image ids; src must provide get_filename(imid).
        The images are drawn side by side in a single row, without axes, and
        the figure is then displayed with show(). """
    figure()
    nbr_results = len(res)
    # subplot positions are 1-based, hence start=1
    for position, imid in enumerate(res, start=1):
        imname = src.get_filename(imid)
        subplot(1, nbr_results, position)
        imshow(array(Image.open(imname)))
        axis('off')
    show()
运行前面的建立索引脚本后,会在当前工作目录下生成索引数据库 testImaAdd.db
# -*- coding: utf-8 -*-
import pickle
from PCV.imagesearch import imagesearch
from PCV.localdescriptors import sift
from sqlite3 import dbapi2 as sqlite
from PCV.tools.imtools import get_imlist
# Step 3 of the pipeline: query the index with the first image of the folder
# and display the best matches.

# Get the list of images.
imlist = get_imlist('E:/test_pic/BOF/')
nbr_images = len(imlist)

# Feature files written by the vocabulary step: the last three characters of
# the image name (assumed to be the extension) are replaced by "sift".
featlist = [imname[:-3] + 'sift' for imname in imlist]

# Load the vocabulary.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('E:/test_pic/BOF/vocabulary.pkl', 'rb') as f:
    voc = pickle.load(f)

# One searcher is enough for both lookups below (the original opened the
# database a second time before querying).
src = imagesearch.Searcher('testImaAdd.db', voc)

# Project the descriptors of the query image on the vocabulary.
locs, descr = sift.read_features_from_file(featlist[0])
iw = voc.project(descr)

print('ask using a histogram...')
print(src.candidates_from_histogram(iw)[:5])

print('try a query...')
# Run the query once and reuse the result for printing and plotting.
matches = src.query(imlist[0])
print(matches[:5])

nbr_results = 5
res = [w[1] for w in matches[:nbr_results]]
imagesearch.plot_results(src, res)

输入的图像:

运行结果:


生成新的文件

输入图像:

运行结果:

原文:https://www.cnblogs.com/bokeyuancj/p/12876406.html