How to free up tensor from GPU while using en_core_web_trf for inference

kaushikepi commented 3 years ago

I am getting out-of-memory (OOM) errors while running inference on documents with the spaCy transformer model. The GPU memory is not being freed up when the Doc objects are created.

svlandeg commented 3 years ago

Were you able to resolve this (as seems to be the case from https://github.com/explosion/spaCy/discussions/7486)?

kaushikepi commented 3 years ago

@svlandeg I tried the solution mentioned in #7486 but it didn't work.

import time
from http import HTTPStatus
from typing import Any, Dict, List, Optional

import spacy
import torch
from fastapi import FastAPI, Request
from pydantic import BaseModel
from spacy.language import Language
from thinc.api import require_gpu, set_gpu_allocator

# FastAPI application object; the route decorators further down register
# their endpoints on it. title/description/version are passed as API metadata.
app = FastAPI(
    title="NPI - Made with Transformers",
    description="Predict provider names given a text input from a medical chart",
    version="0.1",
)

# Ask spaCy for a GPU and print an ANSI-colored status line either way.
if not spacy.prefer_gpu():
    print("\n\033[91m" + "✘ NOT Using GPU!" + "\033[0m\n")
else:
    print("\n\033[92m" + "✔ Using GPU" + "\033[0m\n")
    # Select the "pytorch" GPU allocator, then require device 0.
    set_gpu_allocator("pytorch")
    require_gpu(0)

# Limit PyTorch to one CPU thread before the pipeline is loaded.
torch.set_num_threads(1)

# Load the transformer pipeline, leaving out the components named in
# `exclude`.
nlp_trf = spacy.load(
    "en_core_web_trf",
    exclude=[
        "tagger",
        "parser",
        "attribute_ruler",
        "lemmatizer",
    ],
)

@Language.component("remove_trf_data")
def remove_trf_data(doc):
    """Reset the ``trf_data`` extension attribute of ``doc`` to ``None``.

    Registered as the pipeline component ``"remove_trf_data"``. The doc is
    modified in place and returned.

    NOTE(review): presumably meant to drop the reference to the transformer
    output so its memory can be reclaimed; the surrounding thread reports
    that this alone did not resolve the GPU out-of-memory problem.
    """
    doc._.trf_data = None
    return doc

# Attach the cleanup component to the loaded pipeline by its registered name.
# No position argument is given, so spaCy's default placement applies.
nlp_trf.add_pipe("remove_trf_data")

# One input page: an identifier plus the text to run through the pipeline.
class RecordRequest(BaseModel):
    # Page identifier; used as the key for this page in the /entities result.
    page_no: str
    # Raw page text handed to the spaCy pipeline.
    page_text: str

# Request body of the POST /entities endpoint: a batch of pages.
class RecordRequestIn(BaseModel):
    # Pages to process, in the order they should be fed to the pipeline.
    values: List[RecordRequest]

# Response model of the POST /entities endpoint.
class RecordResponseOut(BaseModel):
    # Maps each page_no to the list of entity records found on that page,
    # or to None when the page produced none.
    entities: Dict[str, Optional[List[Any]]]
    # Elapsed processing time, rendered as a string by the endpoint.
    time_taken: str

@app.get("/", tags=["General"])
def _index(request: Request):
    """Health check."""
    # Always reports HTTP 200 OK, plus the names of the components in the
    # loaded pipeline; "data" is an empty placeholder.
    ok = HTTPStatus.OK
    return {
        "message": ok.phrase,
        "status-code": ok,
        "nlp_pipeline": nlp_trf.pipe_names,
        "data": {},
    }

@app.post("/entities", response_model=RecordResponseOut)
def extract_per_entities(body: RecordRequestIn):
    """Extract PERSON entities from each submitted page.

    Args:
        body: Request payload; ``body.values`` holds the pages, each with a
            ``page_no`` and a ``page_text``.

    Returns:
        A dict with two keys. ``"entities"`` maps each ``page_no`` to a list
        of ``(text, start, end, start_char, end_char)`` tuples for the
        PERSON entities found on that page, or to ``None`` when there are
        none. ``"time_taken"`` is the elapsed time in seconds, as a string.
        Pages that share a ``page_no`` overwrite one another (last wins).
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # (or turn negative) if the system clock is adjusted during the request.
    started = time.perf_counter()

    records = body.values
    page_ids = [record.page_no for record in records]
    texts = [record.page_text for record in records]

    chart_info_pagewise = {}
    # Relies on nlp_trf.pipe() yielding docs in the same order as `texts`,
    # so zip() pairs every doc with the id of the page it came from.
    for page_id, spacy_doc in zip(page_ids, nlp_trf.pipe(texts)):
        people = [
            (ent.text, ent.start, ent.end, ent.start_char, ent.end_char)
            for ent in spacy_doc.ents
            if ent.label_ == "PERSON"
        ]
        # An empty result is reported as None rather than as an empty list.
        chart_info_pagewise[page_id] = people or None

    return {
        "entities": chart_info_pagewise,
        "time_taken": str(time.perf_counter() - started),
    }

Package versions used:

spacy==3.0.5 spacy-transformers==1.0.4 cupy-cuda100

explosion / spaCy

How to free up tensor from GPU while using en_core_web_trf for inference #9086

Package versions used: