pigbreeder / CodeMemo


力抠 (LeetCode) #20

Open testpppppp opened 1 year ago

testpppppp commented 1 year ago

CLIP

https://www.cnblogs.com/chester-cs/p/17478159.html https://github.com/openai/CLIP/blob/main/clip/model.py https://github.com/moein-shariatnia/OpenAI-CLIP/blob/master/CLIP.py


    def forward(self, batch):
        # Getting Image and Text Features
        image_features = self.image_encoder(batch["image"])
        text_features = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        # Getting Image and Text Embeddings (with same dimension)
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)

        # Calculating the Loss
        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        # Soft targets, scaled by the same temperature as the logits (note: the
        # upstream repo multiplies by temperature here, which is inconsistent
        # with the division above)
        targets = F.softmax(
            (images_similarity + texts_similarity) / (2 * self.temperature), dim=-1
        )
        texts_loss = cross_entropy(logits, targets, reduction='none')
        images_loss = cross_entropy(logits.T, targets.T, reduction='none')
        loss = (images_loss + texts_loss) / 2.0  # shape: (batch_size,)
        return loss.mean()

def cross_entropy(preds, targets, reduction='none'):
    # cross entropy against soft (probability) targets
    log_softmax = nn.LogSoftmax(dim=-1)
    loss = (-targets * log_softmax(preds)).sum(1)
    if reduction == "none":
        return loss
    elif reduction == "mean":
        return loss.mean()
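The image_projection / text_projection modules referenced above look roughly like this in the linked repo (a sketch from memory; verify against CLIP.py, and note that projection_dim=256 and dropout=0.1 are assumed defaults):

import torch.nn as nn

class ProjectionHead(nn.Module):
    def __init__(self, embedding_dim, projection_dim=256, dropout=0.1):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        x = x + projected  # residual connection
        x = self.layer_norm(x)
        return x

The snippet below is from the HuggingFace transformers CLIP implementation: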

image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(image_embeds)

text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)

# normalized features
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
logits_per_image = logits_per_text.t()

loss = None
if return_loss:
    loss = clip_loss(logits_per_text)
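The clip_loss called at the end is defined in the HF source roughly as follows (a sketch; the labels are simply arange(batch_size), i.e., the i-th text matches the i-th image):

import torch
import torch.nn as nn

def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))

def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0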

剑指 Offer (classic problems)

https://zhuanlan.zhihu.com/p/453204032

Binary search

https://leetcode.cn/problems/find-first-and-last-position-of-element-in-sorted-array # Find First and Last Position of Element in Sorted Array (left_bound / right_bound)
https://leetcode.cn/problems/search-a-2d-matrix-ii/solutions/ # Search a 2D Matrix II

def searchMatrix(self, matrix: List[List[int]], target: int) -> bool:
    # Binary-search each row: O(m log n)
    for row in matrix:
        idx = bisect.bisect_left(row, target)
        if idx < len(row) and row[idx] == target:
            return True
    return False

def searchMatrix(self, matrix: List[List[int]], target: int) -> bool:
    # Staircase search from the bottom-left corner: O(m + n)
    i, j = len(matrix) - 1, 0
    while i >= 0 and j < len(matrix[0]):
        if matrix[i][j] > target:
            i -= 1
        elif matrix[i][j] < target:
            j += 1
        else:
            return True
    return False

https://leetcode.cn/problems/search-in-rotated-sorted-array/ # Search in Rotated Sorted Array (the key point is comparing against the endpoints)

def search(self, nums, target):
    """
    :type nums: List[int]
    :type target: int
    :rtype: int
    """
    if not nums:
        return -1
    l, r = 0, len(nums) - 1
    while l <= r:
        mid = (l + r) // 2
        if nums[mid] == target:
            return mid
        elif nums[r] == target:
            return r
        elif nums[l] == target:
            return l
        elif nums[mid] > nums[l]:
            # the left half [l, mid] is sorted
            if nums[l] < target < nums[mid]:
                r = mid - 1
            else:
                l = mid + 1
        else:
            # the right half [mid, r] is sorted
            if nums[mid] < target <= nums[r]:
                l = mid + 1
            else:
                r = mid - 1
    return l if l < len(nums) and nums[l] == target else -1

- Euclidean distance

# coding
https://juejin.cn/s/pytorch%20multiheadattention%E4%BD%BF%E7%94%A8
import math

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # linear projections, then split into heads: (bs, seq, num_heads, d_k)
        k = self.k_linear(k).view(bs, -1, self.num_heads, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.num_heads, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.num_heads, self.d_k)

        # transpose to (bs, num_heads, seq, d_k)
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        # scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = nn.functional.softmax(scores, dim=-1)
        output = torch.matmul(scores, v)

        # concatenate heads back to (bs, seq, d_model)
        output = output.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_k)

        # final linear projection
        output = self.out(output)
        return output
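A quick smoke test for the module above (a minimal sketch; the shapes are illustrative):

d_model, num_heads = 64, 8
mha = MultiHeadAttention(d_model, num_heads)
x = torch.randn(2, 10, d_model)  # (batch, seq_len, d_model)
out = mha(x, x, x)               # self-attention: q = k = v
print(out.shape)                 # torch.Size([2, 10, 64])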
testpppppp commented 1 year ago

LIS LCS

https://zhuanlan.zhihu.com/p/62521862

# Longest common substring (must be contiguous)
# dp[i][j] = dp[i - 1][j - 1] + 1 if s1[i - 1] == s2[j - 1] else 0; answer is the max over all dp[i][j]
# Longest common subsequence (need not be contiguous)
dp = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
for i in range(1, len(s1) + 1):
    for j in range(1, len(s2) + 1):
        if s1[i - 1] == s2[j - 1]:
            dp[i][j] = dp[i - 1][j - 1] + 1
        else:
            dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
# dp[len(s1)][len(s2)] is the LCS length
# Longest increasing subsequence, O(n^2) DP
def lengthOfLIS(self, nums: List[int]) -> int:
    if not nums: return 0
    dp = [1] * len(nums)
    for i in range(len(nums)):
        for j in range(i):
            if nums[j] < nums[i]:  # for non-strict increase, change '<' to '<='
                dp[i] = max(dp[i], dp[j] + 1)
    return max(dp)
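A common O(n log n) variant using bisect (a sketch; tails[k] stores the smallest possible tail of an increasing subsequence of length k + 1):

import bisect

def lengthOfLIS_nlogn(nums):
    tails = []
    for x in nums:
        i = bisect.bisect_left(tails, x)  # use bisect_right for non-strict increase
        if i == len(tails):
            tails.append(x)
        else:
            tails[i] = x
    return len(tails)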
# Edit distance https://blog.51cto.com/u_15408171/6869532
The difference is that LCS is insensitive to the length difference between the two strings, while edit distance is sensitive to it: LCS measures overlap, edit distance measures both length and overlap. Set the insert/delete costs in edit distance to 0 and turn the substitution op into a reward for matches, and edit distance reduces to LCS. The core recurrence (completed into a runnable function below):

if word1[i - 1] == word2[j - 1]:
    dp[i][j] = dp[i - 1][j - 1]
else:
    dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
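A runnable completion of the recurrence above (sketch; the base cases fill the first row/column with prefix lengths):

def minDistance(word1, word2):
    m, n = len(word1), len(word2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i  # delete all i chars of word1
    for j in range(n + 1):
        dp[0][j] = j  # insert all j chars of word2
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if word1[i - 1] == word2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
    return dp[m][n]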

Knapsack

https://zhuanlan.zhihu.com/p/35643721

C = [3, 2, 6, 7, 1, 4, 9, 5]      # cost (volume) of each item
V = [6, 3, 5, 8, 3, 1, 6, 9]      # value of each item
Count = [3, 5, 1, 9, 3, 5, 6, 8]  # available count of each item
target = 20
F = [0 for i in range(0, target + 1)]
n = len(C)

def CompleteBackPack(cost, value):
    # Unbounded knapsack: iterate capacity upward so an item can be reused
    for i in range(cost, target + 1):
        F[i] = max(F[i], F[i - cost] + value)

def OneZeroBackPack(cost, value):
    # 0/1 knapsack: iterate capacity downward so each item is used at most once
    for i in reversed(range(cost, target + 1)):
        F[i] = max(F[i], F[i - cost] + value)

def MultipleBackPack(cost, value, count):
    # If count * cost already fills the backpack, the count is effectively
    # unlimited: treat it as unbounded knapsack
    if (cost * count) >= target:
        CompleteBackPack(cost, value)
        return
    # Otherwise use binary splitting (see the "Nine Lectures on Knapsack"):
    # bundle the item into groups of size 1, 2, 4, ... and run 0/1 knapsack on each
    temp_count = 1
    while temp_count < count:
        OneZeroBackPack(temp_count * cost, temp_count * value)
        count = count - temp_count
        temp_count = temp_count * 2
    OneZeroBackPack(count * cost, count * value)  # the leftover group

for i in range(0, n):
    MultipleBackPack(C[i], V[i], Count[i])
print(F[target])

Binary search (templates)

int binary_search(int[] nums, int target) {
    int left = 0, right = nums.length - 1; 
    while(left <= right) {
        int mid = left + (right - left) / 2;
        if (nums[mid] < target) {
            left = mid + 1;
        } else if (nums[mid] > target) {
            right = mid - 1; 
        } else if(nums[mid] == target) {
// found: return immediately
            return mid;
        }
    }
// not found: return directly
    return -1;
}

int left_bound(int[] nums, int target) {
    int left = 0, right = nums.length - 1;
    while (left <= right) {
        int mid = left + (right - left) / 2;
        if (nums[mid] < target) {
            left = mid + 1;
        } else if (nums[mid] > target) {
            right = mid - 1;
        } else if (nums[mid] == target) {
// don't return; shrink the right edge to lock in the left boundary
            right = mid - 1;
        }
    }
// check whether target exists in nums at all
    if (left < 0 || left >= nums.length) {
        return -1;
    }
// verify that nums[left] is actually target
    return nums[left] == target ? left : -1;
}

int right_bound(int[] nums, int target) {
    int left = 0, right = nums.length - 1;
    while (left <= right) {
        int mid = left + (right - left) / 2;
        if (nums[mid] < target) {
            left = mid + 1;
        } else if (nums[mid] > target) {
            right = mid - 1;
        } else if (nums[mid] == target) {
// don't return; push the left edge to lock in the right boundary
            left = mid + 1;
        }
    }
    // the loop exits with right == left - 1, and we want the right boundary,
    // so using right (instead of left - 1) is easier to remember
    if (right < 0 || right >= nums.length) {
        return -1;
    }
    return nums[right] == target ? right : -1;
}
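The same bounds via Python's bisect module (a sketch equivalent to the Java templates above):

import bisect

def left_bound(nums, target):
    i = bisect.bisect_left(nums, target)       # first index with nums[i] >= target
    return i if i < len(nums) and nums[i] == target else -1

def right_bound(nums, target):
    i = bisect.bisect_right(nums, target) - 1  # last index with nums[i] <= target
    return i if i >= 0 and nums[i] == target else -1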

Heap sort

import math

def buildMaxHeap(arr):
    # sift down every non-leaf node, bottom-up (arrLen is a module-level global)
    for i in range(math.floor(len(arr) / 2), -1, -1):
        heapify(arr, i)

def heapify(arr, i):
    # sift arr[i] down within the first arrLen elements
    left = 2 * i + 1
    right = 2 * i + 2
    largest = i
    if left < arrLen and arr[left] > arr[largest]:
        largest = left
    if right < arrLen and arr[right] > arr[largest]:
        largest = right

    if largest != i:
        swap(arr, i, largest)
        heapify(arr, largest)

def swap(arr, i, j):
    arr[i], arr[j] = arr[j], arr[i]

def heapSort(arr):
    global arrLen
    arrLen = len(arr)
    buildMaxHeap(arr)
    # repeatedly move the max to the end and shrink the heap
    for i in range(len(arr) - 1, 0, -1):
        swap(arr, 0, i)
        arrLen -= 1
        heapify(arr, 0)
    return arr
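Quick check (sketch):

print(heapSort([4, 6, 8, 5, 9, 1]))  # [1, 4, 5, 6, 8, 9]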

Quick sort

import random
import sys
sys.setrecursionlimit(10000000)  # raise the recursion depth limit

def quick_sort(data, left, right):
    if left < right:
        mid = partition(data, left, right)  # mid is the final index of the pivot
        quick_sort(data, left, mid - 1)
        quick_sort(data, mid + 1, right)

# Each call to partition places the pivot so that everything to its left is
# smaller and everything to its right is larger
def partition(data, left, right):
    tmp = data[left]
    while left < right:
        while left < right and data[right] >= tmp:  # scan right-to-left for a value < pivot
            right -= 1
        data[left] = data[right]   # move it into the hole on the left
        while left < right and data[left] <= tmp:   # scan left-to-right for a value > pivot
            left += 1
        data[right] = data[left]   # move it into the hole on the right
    data[left] = tmp  # drop the pivot into the final hole
    return left

data = list(range(100))
random.shuffle(data)  # shuffle the sorted list
quick_sort(data, 0, len(data) - 1)
print(data)
testpppppp commented 1 year ago

ref https://www.kaggle.com/code/bigironsphere/loss-function-library-keras-pytorch

# encoding: utf-8

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops.focal_loss import sigmoid_focal_loss

class DiceLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceLoss, self).__init__()

    def forward(self, inputs, targets, smooth=1):
        # comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)

        # flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)

        intersection = (inputs * targets).sum()
        dice = (2. * intersection + smooth) / (inputs.sum() + targets.sum() + smooth)

        return 1 - dice

class DiceBCELoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceBCELoss, self).__init__()

    def forward(self, inputs, targets, smooth=1):
        # comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)

        # flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)

        intersection = (inputs * targets).sum()
        dice_loss = 1 - (2. * intersection + smooth) / (inputs.sum() + targets.sum() + smooth)
        BCE = F.binary_cross_entropy(inputs, targets, reduction='mean')
        Dice_BCE = BCE + dice_loss

        return Dice_BCE

# ALPHA = 0.8
# GAMMA = 2

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs: torch.Tensor, targets: torch.Tensor):
        # targets need to be one-hot encoded
        if inputs.shape != targets.shape:
            batch_size, num_classes = inputs.shape
            targets = torch.nn.functional.one_hot(targets, num_classes)
        targets = targets.to(inputs.device, dtype=torch.float)
        # prefer 'mean' reduction over 'sum'
        return sigmoid_focal_loss(inputs, targets, alpha=self.alpha, gamma=self.gamma, reduction="mean")

# ALPHA = 0.5
# BETA = 0.5

class TverskyLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(TverskyLoss, self).__init__()

    def forward(self, inputs, targets, smooth=1, alpha=0.5, beta=0.5):
        # comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)

        # flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)

        # True Positives, False Positives & False Negatives
        TP = (inputs * targets).sum()
        FP = ((1 - targets) * inputs).sum()
        FN = (targets * (1 - inputs)).sum()

        Tversky = (TP + smooth) / (TP + alpha * FP + beta * FN + smooth)

        return 1 - Tversky

# ALPHA = 0.5
# BETA = 0.5
# GAMMA = 1

class FocalTverskyLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalTverskyLoss, self).__init__()

    def forward(self, inputs, targets, smooth=1, alpha=0.5, beta=0.5, gamma=1):
        # comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)

        # flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)

        # True Positives, False Positives & False Negatives
        TP = (inputs * targets).sum()
        FP = ((1 - targets) * inputs).sum()
        FN = (targets * (1 - inputs)).sum()

        Tversky = (TP + smooth) / (TP + alpha * FP + beta * FN + smooth)
        FocalTversky = (1 - Tversky) ** gamma

        return FocalTversky

class AddTransTextLoss(nn.Module):
    def __init__(self, weight=0.5):
        super(AddTransTextLoss, self).__init__()
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.weight = weight

    def forward(self, inputs, inputs2, targets):
        """
        @inputs:  logits for the original sample
        @inputs2: logits for the corresponding positive (translated) sample
        """
        main_loss = self.loss_fn(inputs, targets)   # main loss
        auxi_loss = self.loss_fn(inputs2, targets)  # auxiliary loss: classify using the translation
        tot_loss = main_loss + auxi_loss * self.weight
        return tot_loss, auxi_loss

class ContrastiveLoss(nn.Module):
    def __init__(self, num_labels, weight=0.5, temp=0.05):
        super(ContrastiveLoss, self).__init__()
        self.main_loss_fn = torch.nn.CrossEntropyLoss()
        self.auxi_loss_fn = torch.nn.BCEWithLogitsLoss()
        self.weight = weight
        self.num_labels = num_labels
        self.temp = temp

    def forward(self, inputs, targets, embs, embs2):
        """
        @inputs:  logits
        @targets: labels
        @embs:    embeddings
        @embs2:   embeddings of the corresponding positive samples
        """
        main_loss = self.main_loss_fn(inputs, targets)  # main loss
        device = main_loss.device

        # with few samples, augment the batch with the positives
        embs = torch.concat([embs, embs2], dim=0)
        targets = torch.concat([targets, targets], dim=0)

        # L2-normalize, then cosine similarity
        batch_size, emb_size = embs.shape
        norm_embs = F.normalize(embs, dim=1, p=2)
        sim_score = torch.matmul(norm_embs, norm_embs.T)
        sim_score = sim_score - torch.eye(batch_size).to(device) * 1e12  # mask out self-similarity
        sim_score = sim_score / self.temp  # temperature scaling

        # build labels: samples sharing a class are positives (a multi-label problem)
        one_hot = F.one_hot(targets, self.num_labels).detach().cpu().float()
        multi_labels = torch.matmul(one_hot, one_hot.T)
        mask = 1 - torch.eye(batch_size)  # a sample is not its own positive
        multi_labels = (mask * multi_labels).to(device)
        auxi_loss = self.auxi_loss_fn(sim_score, multi_labels)
        # print(main_loss, auxi_loss)
        tot_loss = main_loss + auxi_loss * self.weight
        return tot_loss, auxi_loss

if __name__ == '__main__':
    import numpy as np

    loss_func = ContrastiveLoss(num_labels=55)
    inputs = torch.FloatTensor(np.random.randn(10, 55))
    targets = torch.LongTensor(np.random.randint(0, 55, 10))
    print(targets)
    loss = loss_func(inputs, targets, inputs, inputs)
    print(loss)
testpppppp commented 1 year ago
# Find Peak Element (binary search on the slope)
def findPeakElement(self, nums: List[int]) -> int:
    left, right = 0, len(nums) - 1
    while left < right:
        mid = (left + right) >> 1
        if nums[mid] > nums[mid + 1]:
            right = mid
        else:
            left = mid + 1
    return left

# Monotonic queue  https://zhuanlan.zhihu.com/p/447209490
# A monotonic queue is mainly for sliding-window max/min. It is a deque (you can
# append and pop at both ends): push and pop at the back, pop at the front. The
# front always holds the current extremum (this is the key point); keep the queue
# decreasing for a max, increasing for a min.
# https://leetcode.cn/problems/sliding-window-maximum/
def maxSlidingWindow(self, nums: List[int], k: int) -> List[int]:
    n = len(nums)
    q = collections.deque()
    # build the deque for the first window; keep indices of a decreasing sequence
    for i in range(k):
        while q and nums[i] >= nums[q[-1]]:
            q.pop()
        q.append(i)

    ans = [nums[q[0]]]
    for i in range(k, n):
        while q and nums[i] >= nums[q[-1]]:
            q.pop()
        q.append(i)
        # drop indices that have slid out of the window
        while q[0] <= i - k:
            q.popleft()
        ans.append(nums[q[0]])

    return ans
arr = [2, 3, 7, 9, 5, 1, 6, 4, 3]
k = 3

# Maximum of each sliding window of size k, using maxSlidingWindow above (self dropped)
result = maxSlidingWindow(arr, k)  # expected: [7, 9, 9, 9, 6, 6, 6]

# Monotonic stack
# A monotonic stack finds, for each element, the nearest smaller/larger element
# to its left or right.
# Largest Rectangle in Histogram
def largestRectangleArea(self, heights: List[int]) -> int:
    maxArea = 0
    stack = []  # (start index, height); heights on the stack stay increasing
    for index, height in enumerate(heights):
        start = index
        while stack and stack[-1][1] > height:
            # a taller bar cannot extend past the current index; settle its area
            i, h = stack.pop()
            maxArea = max(maxArea, (index - i) * h)
            start = i  # the current bar extends left to where the popped one began
        stack.append((start, height))

    # bars still on the stack extend to the end of the histogram
    for index, height in stack:
        maxArea = max(maxArea, (len(heights) - index) * height)

    return maxArea
# [Maximal Rectangle](https://leetcode.com/problems/maximal-rectangle/)
def maximalRectangle(self, matrix: List[List[str]]) -> int:
    def mah(heights: List[int]) -> int:
        # largest rectangle in a histogram, with a -1 sentinel to flush the stack
        st = []
        maxArea = 0
        for bar in heights + [-1]:
            step = 0
            while st and st[-1][1] >= bar:
                w, h = st.pop()
                step += w
                maxArea = max(maxArea, step * h)
            st.append((step + 1, bar))
        return maxArea

    n, m = len(matrix), len(matrix[0])
    ans = 0
    for i in range(n):
        for j in range(m):
            # turn row i into a histogram of consecutive '1's ending at this row
            matrix[i][j] = 1 if matrix[i][j] == '1' else 0
            if i > 0 and matrix[i][j] != 0:
                matrix[i][j] = matrix[i - 1][j] + matrix[i][j]
        ans = max(ans, mah(matrix[i]))
    return ans

# Lowest common ancestor (C++)
    TreeNode* lowestCommonAncestor(TreeNode* root, TreeNode* p, TreeNode* q) {
        if (!root || root == p || root == q) return root;
        TreeNode *left = lowestCommonAncestor(root->left, p, q);
        TreeNode *right = lowestCommonAncestor(root->right, p, q);
        if (left && right) return root;  // p and q sit in different subtrees
        return left ? left : right;
    }

# Two pointers: Trapping Rain Water
def trap(self, height: List[int]) -> int:
    left, right = 0, len(height) - 1
    l_max, r_max = 0, 0
    res = 0
    while left < right:
        l_max = max(l_max, height[left])
        r_max = max(r_max, height[right])
        if l_max < r_max:
            res += l_max - height[left]
            left += 1
        else:
            res += r_max - height[right]
            right -= 1
    return res

# Two pointers: Container With Most Water
def maxArea(self, height: List[int]) -> int:
    left, right = 0, len(height) - 1
    res = 0
    while left < right:
        # area of the rectangle between [left, right]
        cur_area = min(height[left], height[right]) * (right - left)
        res = max(res, cur_area)
        # two-pointer trick: move the shorter side inward
        if height[left] < height[right]:
            left += 1
        else:
            right -= 1
    return res
# [Maximum Product Subarray](https://leetcode.cn/problems/maximum-product-subarray)
# (stub; sketch below)
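The entry above is a stub; a standard DP sketch for it, tracking both the max and min product ending at each index, since multiplying by a negative swaps them:

def maxProduct(nums):
    cur_max = cur_min = ans = nums[0]
    for x in nums[1:]:
        candidates = (x, cur_max * x, cur_min * x)
        cur_max, cur_min = max(candidates), min(candidates)
        ans = max(ans, cur_max)
    return ans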
testpppppp commented 1 year ago

Model knowledge

LLM

testpppppp commented 1 year ago

Causes of hallucination

Inference speed issues

Long context

llama2-4K, gpt3-16K: mismatch between pretraining context length and inference length causes extrapolation and speed problems

testpppppp commented 10 months ago
# https://www.tensorflow.org/addons/api_docs/python/tfa/losses/contrastive_loss
# https://towardsdatascience.com/creating-custom-loss-functions-using-tensorflow-2-96c123d5ce6c
def contrastive_loss(y_true, y_pred, margin=0.5):
    # y_pred is a pairwise distance; which label value means "similar" depends
    # on the convention (compare the two references above)
    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean((1 - y_true) * square_pred + y_true * margin_square)

# excerpt from the enclosing loss function:
loss = contrastive_loss(y_true, y_pred, 0.5)
# return tf.reduce_sum(loss * is_contra) / tf.reduce_sum(is_contra)
return tf.reduce_mean(loss)
testpppppp commented 10 months ago

Your understanding of this position; your work preferences; what work habits you dislike; what you care about in different jobs

work card strap; why leave

expectation

shortcoming

Team understanding

motivation: strong interest in joining the team; your understanding of this role; what makes this role attractive to you

how would you describe your work/leadership style

your lead experience

what unique engineering challenges you foresee in TnS

Your expectations for this role, and what you want to learn from it.