chen700564 / sdnet

Other
38 stars 6 forks source link

How to make few-shot data #4

Closed YuSawan closed 1 year ago

YuSawan commented 1 year ago

I checked the data you provided so that I could build few-shot data for other datasets, but the size of each support set seems to differ slightly. (Also, there are only 50 examples of "Soundtrack" in mit-movie1/100-shot.json.)

Could you share with me how to make few-shot data?

The number of examples: 1138 {'Plot': 752, 'Actor': 767, 'Genre': 543, 'Year': 492, 'Director': 345, 'Character_Name': 217, 'Opinion': 202, 'Origin': 199, 'Relationship': 176, 'Award': 154, 'Quote': 115, 'Soundtrack': 50}

The number of examples: 1137 {'Plot': 749, 'Actor': 759, 'Genre': 549, 'Year': 465, 'Director': 338, 'Character_Name': 240, 'Opinion': 194, 'Origin': 200, 'Relationship': 182, 'Award': 149, 'Quote': 116, 'Soundtrack': 50}

The number of examples: 1142 {'Plot': 733, 'Actor': 773, 'Genre': 557, 'Year': 479, 'Director': 343, 'Character_Name': 219, 'Opinion': 224, 'Origin': 195, 'Relationship': 183, 'Award': 173, 'Quote': 117, 'Soundtrack': 50}

The number of examples: 1138 {'Plot': 722, 'Actor': 743, 'Genre': 524, 'Year': 470, 'Director': 340, 'Character_Name': 241, 'Opinion': 236, 'Origin': 199, 'Relationship': 182, 'Award': 159, 'Quote': 112, 'Soundtrack': 50}

The number of examples: 1140 {'Plot': 762, 'Actor': 768, 'Genre': 531, 'Year': 471, 'Director': 337, 'Character_Name': 221, 'Opinion': 218, 'Origin': 186, 'Relationship': 171, 'Award': 153, 'Quote': 121, 'Soundtrack': 50}

The number of examples: 1137 {'Plot': 755, 'Actor': 710, 'Genre': 540, 'Year': 483, 'Director': 369, 'Character_Name': 225, 'Opinion': 208, 'Origin': 197, 'Relationship': 179, 'Award': 143, 'Quote': 117, 'Soundtrack': 50}

The number of examples: 1138 {'Plot': 751, 'Actor': 775, 'Genre': 539, 'Year': 500, 'Director': 355, 'Character_Name': 235, 'Opinion': 207, 'Origin': 200, 'Relationship': 164, 'Award': 148, 'Quote': 115, 'Soundtrack': 50}

The number of examples: 1141 {'Plot': 748, 'Actor': 767, 'Genre': 555, 'Year': 463, 'Director': 334, 'Character_Name': 227, 'Opinion': 220, 'Origin': 181, 'Relationship': 188, 'Award': 156, 'Quote': 116, 'Soundtrack': 50}

The number of examples: 1138 {'Plot': 772, 'Actor': 763, 'Genre': 529, 'Year': 454, 'Director': 330, 'Character_Name': 237, 'Opinion': 207, 'Origin': 192, 'Relationship': 171, 'Award': 168, 'Quote': 113, 'Soundtrack': 50}

chen700564 commented 1 year ago

This is because the training set contains only 50 Soundtrack mentions, so we cannot sample 100 shots for this type.

To sample the support sets, you can refer to the following code:

class Sampler:
    """Endless iterator that draws support sets of sample indices.

    Every call to ``next()`` starts a fresh support set and walks through
    ``classes`` in the given order. For each class it looks at the samples
    that mention the class and are not yet part of the current support set:
    if there are at most ``K`` of them, all are taken; otherwise ``K`` of
    them are drawn uniformly at random without replacement.

    Samples already selected for an earlier class are not counted towards a
    later class, so the final support set may contain more than ``K``
    samples mentioning a given class.
    """

    def __init__(self, N, K, samples, classes):
        """Store the sampling configuration.

        Args:
            N: Number of classes. Stored on the instance but not otherwise
                used inside this class.
            K: Maximum number of samples to add per class.
            samples: Sequence of mappings; each must provide a
                ``'class_count'`` container that supports ``in`` tests for
                class labels.
            classes: Iterable of class labels, processed in order.
        """
        self.K = K
        self.N = N
        self.samples = samples
        self.classes = classes
        self.support_idx = []

    def __get_candidates__(self, target_classes):
        """Return indices of unselected samples that mention the class.

        Args:
            target_classes: A single class label to look for in each
                sample's ``'class_count'``.

        Returns:
            Indices into ``self.samples``, in ascending order, whose sample
            contains the label and that are not in ``self.support_idx``.
        """
        # Build the set once so every membership test is O(1) rather than a
        # linear scan of the growing support list.
        taken = set(self.support_idx)
        return [
            idx
            for idx, sample in enumerate(self.samples)
            if target_classes in sample['class_count'] and idx not in taken
        ]

    def __next__(self):
        """Draw a new support set.

        Returns:
            A tuple ``(None, support_idx, None)`` where ``support_idx`` is a
            newly created list of selected sample indices in selection order.
        """
        # A new list per call keeps previously returned support sets intact.
        self.support_idx = []
        for label in self.classes:
            candidates = self.__get_candidates__(label)
            if len(candidates) <= self.K:
                # Not enough samples to choose from: take every one of them.
                self.support_idx.extend(candidates)
                continue
            # More than K candidates: draw K distinct indices one at a time.
            # Removing each drawn index leaves exactly the list a fresh scan
            # would produce, so one scan per label is sufficient.
            drawn = 0
            while drawn < self.K:
                index = random.choice(candidates)
                candidates.remove(index)
                self.support_idx.append(index)
                drawn += 1
        return None, self.support_idx, None

    def __iter__(self):
        """Return the sampler itself; it never raises ``StopIteration``."""
        return self

def getsupportset(N, K, dataset, labels, num_sets=10):
    """Draw several independent support sets from ``dataset``.

    Args:
        N: Number of classes, forwarded to ``Sampler``.
        K: Per-class sample budget, forwarded to ``Sampler``.
        dataset: Samples to draw from, forwarded to ``Sampler``.
        labels: Class labels in the order they should be processed.
        num_sets: How many support sets to draw. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        A list with ``num_sets`` entries, each a list of selected sample
        indices as produced by ``Sampler``.
    """
    sampler = Sampler(N, K, dataset, labels)
    print(N)
    print(K)
    print(labels)
    # Each draw yields (None, support_idx, None); only the indices are kept.
    return [next(sampler)[1] for _ in tqdm.tqdm(range(num_sets))]

def getdata(file, K=5):
    """Build ``<K>shot.json`` support sets from ``<file>/train.json``.

    The training file is read as JSON lines. Every record gets a
    ``'class_count'`` entry mapping each entity type in the record to its
    number of occurrences. Entity types are then ordered by total frequency
    (most frequent first; ties keep first-seen order) and used as the label
    order for sampling. One JSON object per support set is written, holding
    the selected records under ``'support'`` and the label list under
    ``'target_label'``.

    Args:
        file: Directory that contains ``train.json``; the output file is
            written into the same directory.
        K: Per-class sample budget passed on to ``getsupportset``.
    """
    print(file)
    dataset = []
    nums = {}
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's default encoding.
    with open(file + '/train.json', encoding='utf-8') as f:
        for raw in tqdm.tqdm(f):
            if not raw.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw)
            class_count = {}
            for entity in record['entity']:
                entity_type = entity['type']
                nums[entity_type] = nums.get(entity_type, 0) + 1
                class_count[entity_type] = class_count.get(entity_type, 0) + 1
            record['class_count'] = class_count
            dataset.append(record)
    # sorted() is stable, so equally frequent types keep first-seen order.
    nums = sorted(nums.items(), key=lambda x: x[1], reverse=True)
    print(nums)
    labels = [name for name, _ in nums]
    supports = getsupportset(len(labels), K, dataset, labels)
    print('sentencenum:', len(supports[0]))
    with open(file + '/' + str(K) + 'shot.json', 'w', encoding='utf-8') as f:
        for support in supports:
            s = [dataset[index] for index in support]
            f.write(json.dumps({'support': s, 'target_label': labels}) + '\n')
YuSawan commented 1 year ago

I could get few-samples with my dataset. Thank you so much!