Closed sunnyDX closed 4 years ago
Can you post an example script?
Can you post an example script?
Strangely, a core dump happens occasionally. This is the gdb information from the core dump:
(gdb) where
from /opt/taobao/install/ajdk-8.4.8-b211/jre/lib/amd64/server/libjvm.so
at /home/admin/treelite/runtime/native/src/predictor.cc:118
num_feature=30001, num_output_group=1, pred_func_handle=0x7f0b4c93e730 <predict>, rbegin=0, rend=24,
expected_query_result_size=24, out_pred=0x7f0b78027650) at /home/admin/treelite/runtime/native/src/predictor.cc:197
verbose=1, pred_margin=false, out_result=0x7f0b78027650) at /home/admin/treelite/runtime/native/src/predictor.cc:459
pred_margin=false, out_result=0x7f0b78027650) at /home/admin/treelite/runtime/native/src/predictor.cc:495
pred_margin=0, out_result=0x7f0b78027650, out_result_size=0x7f0b4c582260)
at /home/admin/treelite/runtime/native/src/c_api/c_api_runtime.cc:111
jcls=0x7f0b4c5822d8, jhandle=139692267944592, jbatch=139687234795040, jbatch_sparse=1 '\001', jverbose=1 '\001',
jpred_margin=0 '\000', jout_result=0x7f0b4c5822d0, jout_result_size=0x7f0b4c582300)
at /home/admin/treelite/runtime/java/treelite4j/src/native/treelite4j.cpp:180
This corresponds to line 118 of my debugging code:
96 template
I have no idea why this is happening. Can you upload your Java application here so that I can try running it?
I have no idea why this is happening. Can you upload your Java application here so that I can try running it?
This core dump problem has been bothering me for a long time. It only occurs when handling lots of requests at the same time, so I'm afraid you won't be able to reproduce it. Here is the core code:
import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set;
import com.gaode.idccmnpredict.pb.FeatureOuterClass; import com.gaode.idccmnpredict.predict.dto.PredictionResult; import com.google.common.primitives.Floats; import com.google.common.primitives.Ints; import lombok.extern.slf4j.Slf4j; import ml.dmlc.treelite4j.DataPoint; import ml.dmlc.treelite4j.java.BatchBuilder; import ml.dmlc.treelite4j.java.Predictor; import ml.dmlc.treelite4j.java.SparseBatch; import org.apache.commons.lang.ArrayUtils;
/**
 * Scores ad candidates with a Treelite model through the treelite4j bindings.
 *
 * <p>Each candidate's features are concatenated with the user's features into a sparse
 * {@link DataPoint}; all data points are then assembled into one {@link SparseBatch} and
 * passed to the {@link Predictor}.
 */
@Slf4j
public class XgbModelHandler1 {

    /** Treelite predictor; assigned by {@link #loadFromPath(String)}. */
    private Predictor predictor;

    /**
     * Creates the predictor from the model library at {@code modelPath}.
     *
     * @param modelPath path passed to the {@code Predictor} constructor
     * @throws Exception if the predictor cannot be created
     */
    protected void loadFromPath(String modelPath) throws Exception {
        // NOTE(review): presumably (libpath, nthread = 1, verbose = true) — confirm against
        // the treelite4j Predictor constructor.
        predictor = new Predictor(modelPath, 1, true);
    }

    /**
     * Builds one sparse data point per offline ad entry and runs a batch prediction.
     *
     * <p>NOTE(review): in this excerpt neither {@code results} nor {@code out} is consumed
     * after the prediction; the code that does so appears to have been omitted.
     *
     * @param userFeature       user feature
     * @param adOfflineFeature  ad offline feature, keyed by item id
     * @param adRealTimeFeature ad realtime feature, keyed by item id
     * @param out               prediction result holder (not written by the visible code)
     */
    protected void scoreOptimusObject(FeatureCacheItem userFeature, Map<String, Object> adOfflineFeature,
            Map<String, FeatureCacheItem> adRealTimeFeature, PredictionResult out) {
        int itemNums = adOfflineFeature.size();

        // Ids of invalid samples, i.e. samples without any feature.
        Set<String> invalidIdx = new HashSet<>();

        // Generate the DataPoint instances.
        List<DataPoint> dataPointList = getDataPoint(invalidIdx, userFeature, adOfflineFeature, adRealTimeFeature);

        // Predict.
        float[][] results = new float[itemNums][1];
        if (!dataPointList.isEmpty()) {
            try {
                SparseBatch batch = BatchBuilder.CreateSparseBatch(dataPointList.iterator());
                try {
                    results = predictor.predict(batch, true, false);
                } finally {
                    // SparseBatch frees its native handle from finalize(). Without a later use
                    // of `batch`, the object may become unreachable while the native predict
                    // call is still reading the batch, so a GC cycle could free the memory
                    // under it. Disposing here keeps `batch` reachable until predict returns
                    // and releases the native memory deterministically.
                    // NOTE(review): assumes SparseBatch.dispose() is public and safe to call
                    // again from the finalizer — confirm for the treelite4j version in use.
                    batch.dispose();
                }
            } catch (Exception e) {
                log.error("[BatchBuilder] SparseBatch Create error", e);
            }
        }
    }

    /**
     * Generates the DataPoint instances, one per entry of {@code adOfflineFeature}.
     *
     * <p>For every item the offline feature is parsed, merged with the realtime feature of the
     * same id (if any) and appended to the user feature. Items whose combined feature is empty,
     * or whose index and value arrays differ in length, are skipped and their id is recorded
     * in {@code invalidIdx}.
     *
     * @param invalidIdx        output set receiving the ids of skipped items
     * @param userFeature       user feature
     * @param adOfflineFeature  ad offline feature
     * @param adRealTimeFeature ad realtime feature
     * @return the data points of all valid items, in map iteration order
     */
    protected List<DataPoint> getDataPoint(Set<String> invalidIdx, FeatureCacheItem userFeature,
            Map<String, Object> adOfflineFeature, Map<String, FeatureCacheItem> adRealTimeFeature) {
        List<DataPoint> dataPointList = new ArrayList<DataPoint>(adOfflineFeature.size());
        for (Entry<String, Object> entry : adOfflineFeature.entrySet()) {
            String item = entry.getKey();
            FeatureCacheItem itemFeatures = new FeatureCacheItem(entry.getValue().toString());

            // Single lookup; a key mapped to null is treated like a missing key instead of
            // failing inside merge().
            FeatureCacheItem realTimeFeatures = adRealTimeFeature.get(item);
            if (realTimeFeatures != null) {
                itemFeatures.merge(realTimeFeatures);
            }

            int[] indices = ArrayUtils.addAll(userFeature.getKeys(), itemFeatures.getKeys());
            float[] values = ArrayUtils.addAll(userFeature.getVals(), itemFeatures.getVals());
            if (indices.length > 0 && indices.length == values.length) {
                dataPointList.add(new DataPoint(indices, values));
            } else {
                invalidIdx.add(item);
            }
        }
        return dataPointList;
    }
}
/**
 * Holder for a sparse feature vector stored as two arrays: feature keys and feature values.
 *
 * <p>The arrays are stored and returned by reference; no defensive copies are made.
 */
class FeatureCacheItem {

    /** Feature keys. Presumably index-aligned with {@link #vals}; this class does not enforce it. */
    private int[] keys;

    /** Feature values. */
    private float[] vals;

    /**
     * Wraps the given arrays without copying them.
     *
     * @param keys feature keys
     * @param vals feature values
     */
    public FeatureCacheItem(int[] keys, float[] vals) {
        this.keys = keys;
        this.vals = vals;
    }

    /**
     * Parses a serialized {@code FeatureOuterClass.Feature} message carried in a string.
     *
     * <p>The string is converted back to bytes with ISO-8859-1 before parsing. If anything
     * goes wrong (including a {@code null} argument), the item is left empty rather than
     * failing.
     *
     * @param val serialized feature message, one byte per character
     */
    public FeatureCacheItem(String val) {
        try {
            // Charset overload: no lookup by name and no UnsupportedEncodingException.
            FeatureOuterClass.Feature feature = FeatureOuterClass.Feature.parseFrom(
                    val.getBytes(StandardCharsets.ISO_8859_1));
            keys = Ints.toArray(feature.getKeyList());
            vals = Floats.toArray(feature.getValList());
        } catch (Exception ignored) {
            // Deliberate best effort: an unparsable payload yields an empty feature set.
            keys = new int[0];
            vals = new float[0];
        }
    }

    /** @return the feature keys (the internal array, not a copy) */
    public int[] getKeys() {
        return keys;
    }

    /** @param keys the feature keys to store (kept by reference) */
    public void setKeys(int[] keys) {
        this.keys = keys;
    }

    /** @return the feature values (the internal array, not a copy) */
    public float[] getVals() {
        return vals;
    }

    /** @param vals the feature values to store (kept by reference) */
    public void setVals(float[] vals) {
        this.vals = vals;
    }

    /**
     * Appends the keys and values of {@code featureCacheItem} to this item.
     *
     * @param featureCacheItem item whose features are appended; {@code null} is a no-op
     */
    public void merge(FeatureCacheItem featureCacheItem) {
        if (featureCacheItem == null) {
            return;
        }
        this.keys = ArrayUtils.addAll(this.keys, featureCacheItem.getKeys());
        this.vals = ArrayUtils.addAll(this.vals, featureCacheItem.getVals());
    }
}
I wonder if some SparseBatch
objects are being garbage-collected away.
I wonder if some
SparseBatch
objects are being garbage-collected away.
That is something I should consider as well. Maybe it's a JNI problem?
I wonder if some
SparseBatch
objects are being garbage-collected away.
Yes, that's it. C++ is still using the CSRBatch object, but occasionally the SparseBatch has already been garbage-collected by the JVM.
At first, I wondered whether the problem was in this code:
https://github.com/dmlc/treelite/blob/mainline/runtime/java/treelite4j/src/main/java/ml/dmlc/treelite4j/java/SparseBatch.java#L52
protected void finalize() throws Throwable { super.finalize(); dispose(); }
I changed the sequence, but the problem is still there.
dispose(); super.finalize();
Thanks.
The issue is that SparseBatch
objects are currently accessed in zero-copy fashion, and some SparseBatch
objects are being garbage-collected away, resulting in dangling references. In the upcoming code refactoring, I am creating a separate data matrix class that will require one data copy. Making a copy may hurt performance but will eliminate dangling references.
In https://github.com/dmlc/treelite/pull/196, I created a separate data matrix class (DMatrix
) that manages its own memory. Each time the matrix is constructed, the arrays in the parameters will be copied into the matrix. This way, garbage collection will not lead to dangling references.
The recent refactor (#196, #198, #199, #201, #203) created a dedicated data matrix class (DMatrix
) that manages its own memory. As a result, garbage collection will no longer result in dangling references.
ble { super.finalize(); dispose(); }
I changed the sequence, but the problem is still there. @sunnyDX 你好,能加个丁丁吗,我也遇到了相同的问题
Here is the GDB information: gdb /opt/taobao/java/bin/java --core=core-616-java-3489-1584598903
[Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". Core was generated by `/opt/taobao/java/bin/java -server -Xms2g -Xmx2g -Xmn1g -XX:MetaspaceSize=256m -'. Program terminated with signal 6, Aborted.
0 0x00007f9f7cf27277 in raise () from /lib64/libc.so.6
Missing separate debuginfos, use: debuginfo-install ali-jdk-8.4.8-1574344.alios7.x86_64 (gdb) bt
0 0x00007f9f7cf27277 in raise () from /lib64/libc.so.6
1 0x00007f9f7cf28968 in abort () from /lib64/libc.so.6
2 0x00007f9f7c81ffd5 in os::abort(bool) () from /opt/taobao/install/ajdk-8.4.8-b211/jre/lib/amd64/server/libjvm.so
3 0x00007f9f7c9ddce3 in VMError::report_and_die() () from /opt/taobao/install/ajdk-8.4.8-b211/jre/lib/amd64/server/libjvm.so
4 0x00007f9f7c825c32 in JVM_handle_linux_signal () from /opt/taobao/install/ajdk-8.4.8-b211/jre/lib/amd64/server/libjvm.so
5 0x00007f9f7c81bd13 in signalHandler(int, siginfo, void) () from /opt/taobao/install/ajdk-8.4.8-b211/jre/lib/amd64/server/libjvm.so
6
7 0x00007f9e0cbba448 in unsigned long (anonymous namespace)::PredictBatch_(treelite::CSRBatch const, bool, unsigned long, unsigned long, void, unsigned long, unsigned long, unsigned long, float*) [clone .isra.217] ()
from /tmp/libtreelite4j9026278753296637105.so
8 0x00007f9e0cbc6f71 in unsigned long treelite::Predictor::PredictBatchBase_(treelite::CSRBatch const, int, bool, float) () from /tmp/libtreelite4j9026278753296637105.so
9 0x00007f9e0cbcbb75 in TreelitePredictorPredictBatch () from /tmp/libtreelite4j9026278753296637105.so
10 0x00007f9e0cbb4caa in Java_ml_dmlc_treelite4j_TreeliteJNI_TreelitePredictorPredictBatch ()
from /tmp/libtreelite4j9026278753296637105.so
11 0x00007f9f6733f79f in ?? ()
12 0x0000000000000000 in ?? ()