rcongiu / Hive-JSON-Serde

Read - Write JSON SerDe for Apache Hive.
Other
734 stars 391 forks source link

Exception when parsing multi-byte twitter data #18

Closed dmoore247 closed 11 years ago

dmoore247 commented 11 years ago

Good stuff here in this Serde, probably the best out there.

I think we're hitting the same or similar problem parsing twitter data. The problem seems prevalent when parsing tweets with multi-byte characters, e.g. lang="jp" is common in the failures we're seeing.

I'm still working on problem determination, though careful reading of the exception says that "300 " is not being parsed as an int. The only place in the data where the characters 300 occur is at the end of the description field and it's part of a multibyte string thats not even being projected.

It would be great for us if the serde ignored JSON that causes parse exceptions rather than throw the exception.

Many thanks in advance, Douglas

CODE:

CREATE EXTERNAL TABLE tweets_clean (
 created_at STRING,
 lang STRING,
 entities STRUCT<
   urls:ARRAY<STRING>,
   user_mentions:ARRAY<STRING>,
   hashtags:ARRAY<STRUCT<text:STRING>>>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES ( "ignore.malformed.json" = "true")
LOCATION '/twitter-data/';

SELECT
 LOWER(hashtags.text) AS hashtag,
 unix_timestamp(created_at, "EEE MMM d HH:mm:ss Z yyyy")/900 AS timebucket,
 COUNT(*) AS total_count
FROM tweets_clean
LATERAL VIEW EXPLODE(entities.hashtags) t1 AS hashtags
WHERE hashtags is not null
AND created_at is not null
GROUP BY LOWER(hashtags.text), unix_timestamp(created_at, "EEE MMM d HH:mm:ss Z yyyy")/900
ORDER BY total_count DESC

LOG ENTRY: java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing writable {"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"\u30b3\u30a4\u30f3\u30e9\u30f3\u30c9\u30ea\u30fc\u3067\u3050\u308b\u3050\u308b\u56de\u308b\u6d17\u6fef\u7269\u3092\u3058\u3063\u3068\u898b\u3064\u3081\u308b\u304a\u3070\u3055\u3093\u306e\u76ee\u304c\u6016\u3044\u304f\u3089\u3044\u306b\u6b7b\u3093\u3067\u308b\u3002\u4f55\u304c\u3042\u3063\u305f\u3093\u3060\u3088\u304a\u3070\u3055\u3093\uff01","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145726868841181186","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twicca.r246.jp\/\" rel=\"nofollow\"\u003Etwicca\u003C\/a\u003E","created_at":"Sun Dec 11 04:49:27 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile_use_background_image":true,"lang":"ja","favourites_count":117,"profile_text_color":"3E4415","id_str":"183252066","profile_background_image_url":"http:\/\/a1.twimg.com\/images\/themes\/theme5\/bg.gif","screen_name":"aka_navratilova","statuses_count":4267,"profile_link_color":"D02B55","description":"work to eat,eat to work.\r\n\u30b5\u30f3\u30c7\u30fc\u30b7\u30f3\u30ac\u30fc\u30bd\u30f3\u30b0\u30e9\u30a4\u30bf\u30fc\u3067\u3059\u300

    at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:161)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:405)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:336)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:396)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1109)
    at org.apache.hadoop.mapred.Child.main(Child.java:264)

Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing writable {"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"\u30b3\u30a4\u30f3\u30e9\u30f3\u30c9\u30ea\u30fc\u3067\u3050\u308b\u3050\u308b\u56de\u308b\u6d17\u6fef\u7269\u3092\u3058\u3063\u3068\u898b\u3064\u3081\u308b\u304a\u3070\u3055\u3093\u306e\u76ee\u304c\u6016\u3044\u304f\u3089\u3044\u306b\u6b7b\u3093\u3067\u308b\u3002\u4f55\u304c\u3042\u3063\u305f\u3093\u3060\u3088\u304a\u3070\u3055\u3093\uff01","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145726868841181186","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twicca.r246.jp\/\" rel=\"nofollow\"\u003Etwicca\u003C\/a\u003E","created_at":"Sun Dec 11 04:49:27 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile_use_background_image":true,"lang":"ja","favourites_count":117,"profile_text_color":"3E4415","id_str":"183252066","profile_background_image_url":"http:\/\/a1.twimg.com\/images\/themes\/theme5\/bg.gif","screen_name":"aka_navratilova","statuses_count":4267,"profile_link_color":"D02B55","description":"work to eat,eat to work.\r\n\u30b5\u30f3\u30c7\u30fc\u30b7\u30f3\u30ac\u30fc\u30bd\u30f3\u30b0\u30e9\u30a4\u30bf\u30fc\u3067\u3059\u300

    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:524)
    at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:143)
    ... 8 more
Caused by: java.lang.NumberFormatException: For input string: "300  "
    at java.lang.NumberFormatException.forInputString(NumberFormatException.java:48)
    at java.lang.Integer.parseInt(Integer.java:458)
    at org.openx.data.jsonserde.json.JSONTokener.nextString(JSONTokener.java:279)
    at org.openx.data.jsonserde.json.JSONTokener.nextValue(JSONTokener.java:359)
    at org.openx.data.jsonserde.json.JSONObject.<init>(JSONObject.java:208)
    at org.openx.data.jsonserde.json.JSONTokener.nextValue(JSONTokener.java:362)
    at org.openx.data.jsonserde.json.JSONObject.<init>(JSONObject.java:208)
    at org.openx.data.jsonserde.json.JSONObject.<init>(JSONObject.java:310)
    at org.openx.data.jsonserde.JsonSerDe$1.<init>(JsonSerDe.java:150)
    at org.openx.data.jsonserde.JsonSerDe.deserialize(JsonSerDe.java:150)
    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:508)
    ... 9 more
rcongiu commented 11 years ago

Odd... but I don't see the string '300' in the log below, it looks truncated (\u300 seems truncated as it's a unicode character missing the 4th digit).

If it's possible (it may not be depending on the sensitivity of your data) could you send me by email the file that's causing the issue (or the smallest portion that causes the issue) to reproduce it ? rcongiu at yahoo dot com.

Oh, and just to check, the japanese tweets are stored in UTF-8 or in some other encoding ? 

R.

 

"Good judgment comes from experience.

Experience comes from bad judgment"

Data Engineer - OpenX.org Pasadena, CA Skype: sardodazione Y! IM: rcongiu


From: Douglas Moore notifications@github.com To: rcongiu/Hive-JSON-Serde Hive-JSON-Serde@noreply.github.com Sent: Wednesday, November 28, 2012 9:18 AM Subject: [Hive-JSON-Serde] Exception when parsing multi-byte twitter data (#18)

Good stuff here in this Serde, probably the best out there. I think we're hitting the same or similar problem parsing twitter data. The problem seems prevalent when parsing tweets with multi-byte characters, e.g. lang="jp" is common in the failures we're seeing. I'm still working on problem determination, though careful reading of the exception says that "300 " is not being parsed as an int. The only place in the data where the characters 300 occur is at the end of the description field and it's part of a multibyte string thats not even being projected. It would be great for us if the serde ignored JSON that causes parse exceptions rather than throw the exception. Many thanks in advance, Douglas CODE: CREATE EXTERNAL TABLE tweets_clean ( created_at STRING, lang STRING, entities STRUCT< urls:ARRAY, user_mentions:ARRAY, hashtags:ARRAY<STRUCT>> ) ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' WITH SERDEPROPERTIES ( "ignore.malformed.json" = "true") LOCATION '/twitter-data/'; SELECT LOWER(hashtags.text) AS hashtag, unix_timestamp(created_at, "EEE MMM d HH:mm:ss Z yyyy")/900 AS timebucket, COUNT(*) AS total_count FROM tweets_clean LATERAL VIEW EXPLODE(entities.hashtags) t1 AS hashtags WHERE hashtags is not null AND created_at is not null GROUP BY LOWER(hashtags.text), unix_timestamp(created_at, "EEE MMM d HH:mm:ss Z yyyy")/900 ORDER BY total_count DESC LOG ENTRY: java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing writable {"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"\u30b3\u30a4\u30f3\u30e9\u30f3\u30c9\u30ea\u30fc\u3067\u3050\u308b\u3050\u308b\u56de\u308b\u6d17\u6fef\u7269\u3092\u3058\u3063\u3068\u898b\u3064\u3081\u308b\u304a\u3070\u3055\u3093\u306e\u76ee\u304c\u6016\u3044\u304f\u3089\u3044\u306b\u6b7b\u3093\u3067\u308b\u3002\u4f55\u304c\u3042\u3063\u305f\u3093\u3060\u3088\u304a\u3070\u3055\u3093\uff01","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145726868841181186","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twicca.r246.jp\/\" rel=\"nofollow\"\u003Etwicca\u003C\/a\u003E","created_at":"Sun Dec 11 04:49:27 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile _use_background_image":true,"lang":"ja","favourites_count":117,"profile_text_color":"3E4415","id_str":"183252066","profile_background_image_url":"http:\/\/a1.twimg.com\/images\/themes\/theme5\/bg.gif","screen_name":"aka_navratilova","statuses_count":4267,"profile_link_color":"D02B55","description":"work to eat,eat to work.\r\n\u30b5\u30f3\u30c7\u30fc\u30b7\u30f3\u30ac\u30fc\u30bd\u30f3\u30b0\u30e9\u30a4\u30bf\u30fc\u3067\u3059\u300 at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:161) at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:50) at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:405) at org.apache.hadoop.mapred.MapTask.run(MapTask.java:336) at org.apache.hadoop.mapred.Child$4.run(Child.java:270) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1109) at org.apache.hadoop.mapred.Child.main(Child.java:264) Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing writable {"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"\u30b3\u30a4\u30f3\u30e9\u30f3\u30c9\u30ea\u30fc\u3067\u3050\u308b\u3050\u308b\u56de\u308b\u6d17\u6fef\u7269\u3092\u3058\u3063\u3068\u898b\u3064\u3081\u308b\u304a\u3070\u3055\u3093\u306e\u76ee\u304c\u6016\u3044\u304f\u3089\u3044\u306b\u6b7b\u3093\u3067\u308b\u3002\u4f55\u304c\u3042\u3063\u305f\u3093\u3060\u3088\u304a\u3070\u3055\u3093\uff01","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145726868841181186","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twicca.r246.jp\/\" rel=\"nofollow\"\u003Etwicca\u003C\/a\u003E","created_at":"Sun Dec 11 04:49:27 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile_use_backgroun d_image":true,"lang":"ja","favourites_count":117,"profile_text_color":"3E4415","id_str":"183252066","profile_background_image_url":"http:\/\/a1.twimg.com\/images\/themes\/theme5\/bg.gif","screen_name":"aka_navratilova","statuses_count":4267,"profile_link_color":"D02B55","description":"work to eat,eat to work.\r\n\u30b5\u30f3\u30c7\u30fc\u30b7\u30f3\u30ac\u30fc\u30bd\u30f3\u30b0\u30e9\u30a4\u30bf\u30fc\u3067\u3059\u300 at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:524) at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:143) ... 8 more Caused by: java.lang.NumberFormatException: For input string: "300 " at java.lang.NumberFormatException.forInputString(NumberFormatException.java:48) at java.lang.Integer.parseInt(Integer.java:458) at org.openx.data.jsonserde.json.JSONTokener.nextString(JSONTokener.java:279) at org.openx.data.jsonserde.json.JSONTokener.nextValue(JSONTokener.java:359) at org.openx.data.jsonserde.json.JSONObject.(JSONObject.java:208) at org.openx.data.jsonserde.json.JSONTokener.nextValue(JSONTokener.java:362) at org.openx.data.jsonserde.json.JSONObject.(JSONObject.java:208) at org.openx.data.jsonserde.json.JSONObject.(JSONObject.java:310) at org.openx.data.jsonserde.JsonSerDe$1.(JsonSerDe.java:150) at org.openx.data.jsonserde.JsonSerDe.deserialize(JsonSerDe.java:150) at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:508) ... 9 more — Reply to this email directly or view it on GitHub.

dmoore247 commented 11 years ago

We have 640 of these files so it may take a while to isolate the file(s) causing the issue. The data is kept in a plain text file.

dmoore247 commented 11 years ago

Turns out the data file is corrupted with what looks like download status info, so technically what we're sending in is not JSON but malformed junk....

{"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"Don't send an angel to face the Devil...","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145728382984925184","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitterrific.com\" rel=\"nofollow\"\u003ETwitterrific\u003C\/a\u003E","created_at":"Sun Dec 11 04:55:28 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile_use_background_image":true,"lang":"en","favourites_count":4,"profile_text_color":"277869","id_str":"44374197","profile_background_image_url":"http:\/\/a3.twimg.com\/profile_background_images\/102269290\/MObackground.jpg","screen_name":"TheBisclavret","statuses_count":5596,"profile_link_color":"000000","description":"Photographer at Flip Out Photography. Head of photography for Love Like Rain. Anything else just ask.","is_translator":false,"follow_request_sent":null,"following":null,"geo_enabled":false,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/102269290\/MObackground.jpg","created_at":"Wed Jun 03 14:54:15 +0000 2009","friends_count":66,"listed_count":3,"verified":false,"profile_background_color":"693d24","show_all_inline_media":false,"profile_background_tile":false,"default_profile":false,"profile_sidebar_fill_color":"613529","followers_count":129,"protected":false,"url":"http:\/\/facebook.com\/flipoutphotography","profile_image_url":"http:\/\/a1.twimg.com\/profile_images\/1678055603\/b6b3832a1ec311e1abb01231381b65e3_7_normal.jpg","time_zone":"Central Time (US & Canada)","name":"Aaron Scott","default_profile_image":false,"contributors_enabled":false,"profile_sidebar_border_color":"ad9f56","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1678055603\/b6b3832a1ec311e1abb01231381b65e3_7_normal.jpg","id":44374197,"notifications":null,"utc_offset":-21600,"location":"Arkansas"},"id":145728382984925184,"retweeted":false} {"in_reply_to_user_id_str":null,"in_reply_to_status_id":null,"text":"I need a drink, or two.","entities":{"urls":[],"hashtags":[],"user_mentions":[]},"id_str":"145728382959747073","place":null,"truncated":false,"contributors":null,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitter.com\/#!\/download\/iphone\" rel=\"nofollow\"\u003ETwitter for iPhone\u003 100 98.3M 0 98.3M 0 0 65130 0 --:--:-- 0:26:23 --:--:-- 57146C\/a\u003E","created_at":"Sun Dec 11 04:55:28 +0000 2011","geo":null,"retweet_count":0,"favorited":false,"coordinates":null,"in_reply_to_screen_name":null,"in_reply_to_status_id_str":null,"user":{"profile_use_background_image":true,"lang":"en","favourites_count":6,"profile_text_color":"0a0909","id_str":"48191422","profile_background_image_url":"http:\/\/a1.twimg.com\/profile_background_images\/159110741\/IMG_0101.jpg","screen_name":"_BIGGS17","default_profile":false,"statuses_count":13866,"profile_link_color":"B40B43","description":"one of one. none before, none to come. ","is_translator":false,"follow_request_sent":null,"following":null,"geo_enabled":true,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/159110741\/IMG_0101.jpg","created_at":"Thu Jun 18 01:13:29 +0000 2009","friends_count":86,"listed_count":5,"verified":false,"profile_background_color":"FF6699","show_all_inline_media":true,"profile_background_tile":true,"profile_sidebar_fill_color":"474343","followers_count":279,"protected":false,"url":null,"profile_image_url":"http:\/\/a2.twimg.com\/profile_images\/1660058347\/image_normal.jpg","time_zone":"Eastern Time (US & Canada)","name":"Kat","default_profile_image":false,"contributors_enabled":false,"profile_sidebar_border_color":"faebec","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1660058347\/image_normal.jpg","id":48191422,"notifications":null,"utc_offset":-18000,"location":"NYC"},"id":145728382959747073,"retweeted":false}

rcongiu commented 11 years ago

Ok... so the data was corrupted after all. I'll close the issue then.