Open tmerien opened 3 years ago
+1 for the dockerized scenario
docker-compose run -v $PWD/twint:/opt/app/data twint -s AAPL -es 172.19.0.2:9200 --since 2020-11-01
[+] Indexing to Elasticsearch @ 172.19.0.2:9200
.......Traceback (most recent call last):
File "/usr/local/bin/twint", line 11, in <module>
load_entry_point('twint', 'console_scripts', 'twint')()
File "/src/twint/twint/cli.py", line 339, in run_as_command
main()
File "/src/twint/twint/cli.py", line 330, in main
run.Search(c)
File "/src/twint/twint/run.py", line 410, in Search
run(config, callback)
File "/src/twint/twint/run.py", line 329, in run
get_event_loop().run_until_complete(Twint(config).main(callback))
File "/usr/lib/python3.7/asyncio/base_events.py", line 579, in run_until_complete
return future.result()
File "/src/twint/twint/run.py", line 235, in main
await task
File "/src/twint/twint/run.py", line 286, in run
await self.tweets()
File "/src/twint/twint/run.py", line 226, in tweets
await output.Tweets(tweet, self.config, self.conn)
File "/src/twint/twint/output.py", line 166, in Tweets
await checkData(tweets, config, conn)
File "/src/twint/twint/output.py", line 152, in checkData
elasticsearch.Tweet(tweet, config)
File "/src/twint/twint/storage/elasticsearch.py", line 293, in Tweet
helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/helpers/actions.py", line 390, in bulk
for ok, item in streaming_bulk(client, actions, *args, **kwargs):
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/helpers/actions.py", line 320, in streaming_bulk
**kwargs
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/helpers/actions.py", line 247, in _process_bulk_chunk
for item in gen:
File "/usr/local/lib/python3.7/dist-packages/elasticsearch/helpers/actions.py", line 188, in _process_bulk_chunk_success
raise BulkIndexError("%i document(s) failed to index." % len(errors), errors)
The mapping, either created by twint explicitly, or implicitly and dynamically when indexing a tweet, is not adequate for two reasons:

- `mentions` is a nested object of `screen_name`, `name` and `id`, not just `text`.
- `place` is either a `text` or a nested object of `coordinates` and `type`, which Elasticsearch can't really handle well.

Therefore I did a quick fix REMOVING and re-creating the index with a slightly different mapping which doesn't dynamically add new fields, like the inconsistent `place` (`"dynamic": false`), and creates a nested `mentions`.

This however:

- deletes all tweets, consider reindexing if you don't want to, and
- never saves the place or other fields not specified in the mapping below.
If you know what you're doing, replace FRANK, SINATRA and JUNIOR with DELETE, PUT and PUT respectively.
curl -XFRANK "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json'
curl -XSINATRA "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d '
{
"settings": {
"analysis": {
"normalizer": {
"hashtag_normalizer": {
"filter": [
"lowercase",
"asciifolding"
],
"type": "custom",
"char_filter": []
}
}
}
}
}'
curl -XJUNIOR "localhost:9200/twinttweets/_mapping?pretty" -H 'Content-Type: application/json' -d'
{
"dynamic": false,
"properties": {
"cashtags": {
"type": "keyword",
"normalizer": "hashtag_normalizer"
},
"conversation_id": {
"type": "long"
},
"created_at": {
"type": "text"
},
"date": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"day": {
"type": "integer"
},
"essid": {
"type": "keyword"
},
"geo_near": {
"type": "geo_point"
},
"geo_tweet": {
"type": "geo_point"
},
"hashtags": {
"type": "keyword",
"normalizer": "hashtag_normalizer"
},
"hour": {
"type": "integer"
},
"id": {
"type": "long"
},
"lang": {
"type": "keyword"
},
"language": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"link": {
"type": "text"
},
"location": {
"type": "keyword"
},
"mentions": {
"type": "nested",
"properties": {
"id": {
"type": "long"
},
"name": {
"type": "text"
},
"screen_name": {
"type": "text"
}
}
},
"name": {
"type": "text"
},
"near": {
"type": "text"
},
"nlikes": {
"type": "integer"
},
"nreplies": {
"type": "integer"
},
"nretweets": {
"type": "integer"
},
"photos": {
"type": "text"
},
"profile_image_url": {
"type": "text"
},
"quote_url": {
"type": "text"
},
"reply_to": {
"type": "nested",
"properties": {
"id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"screen_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"user_id": {
"type": "keyword"
},
"username": {
"type": "keyword"
}
}
},
"retweet": {
"type": "text"
},
"retweet_date": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss",
"ignore_malformed": true
},
"retweet_id": {
"type": "keyword"
},
"search": {
"type": "text"
},
"source": {
"type": "keyword"
},
"thumbnail": {
"type": "text"
},
"timezone": {
"type": "keyword"
},
"trans_dest": {
"type": "keyword"
},
"trans_src": {
"type": "keyword"
},
"translate": {
"type": "text"
},
"tweet": {
"type": "text"
},
"urls": {
"type": "keyword"
},
"user_id_str": {
"type": "keyword"
},
"user_rt": {
"type": "keyword"
},
"user_rt_id": {
"type": "keyword"
},
"username": {
"type": "keyword",
"normalizer": "hashtag_normalizer"
},
"video": {
"type": "integer"
}
}
}'
@prhbrt I came to the same conclusion, and created a fix in a branch instead
I am new to this code base, but IMO also https://github.com/twintproject/twint/blob/master/twint/storage/elasticsearch.py#L219 needs to be removed to make it work, besides, place information is processed at https://github.com/twintproject/twint/blob/master/twint/storage/elasticsearch.py#L279 and put in a different field altogether. Seems place is not used at all, but it might be wrong here
Guys, thank you for answering. I'm going to check this tomorrow morning!
Thanks for helping
The mapping, either created by twint explicitly, or implicitly and dynamically when indexing a tweet, is not adequate for two reasons:

- `mentions` is a nested object of `screen_name`, `name` and `id`, not just `text`.
- `place` is either a `text` or a nested object of `coordinates` and `type`, which Elasticsearch can't really handle well.

Therefore I did a quick fix REMOVING and re-creating the index with a slightly different mapping which doesn't dynamically add new fields, like the inconsistent `place` (`"dynamic": false`), and creates a nested `mentions`.

This however:
- deletes all tweets, consider reindexing if you don't want to, and
- never saves the place or other fields not specified in the mapping below.
If you know what you're doing, replace FRANK, SINATRA and JUNIOR with DELETE, PUT and PUT respectively.
curl -XFRANK "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' curl -XSINATRA "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d ' { "settings": { "analysis": { "normalizer": { "hashtag_normalizer": { "filter": [ "lowercase", "asciifolding" ], "type": "custom", "char_filter": [] } } } } }' curl -XJUNIOR "localhost:9200/twinttweets/_mapping?pretty" -H 'Content-Type: application/json' -d' { "dynamic": false, "properties": { "cashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "conversation_id": { "type": "long" }, "created_at": { "type": "text" }, "date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" }, "day": { "type": "integer" }, "essid": { "type": "keyword" }, "geo_near": { "type": "geo_point" }, "geo_tweet": { "type": "geo_point" }, "hashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "hour": { "type": "integer" }, "id": { "type": "long" }, "lang": { "type": "keyword" }, "language": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "link": { "type": "text" }, "location": { "type": "keyword" }, "mentions": { "type": "nested", "properties": { "id": { "type": "long" }, "name": { "type": "text" }, "screen_name": { "type": "text" } } }, "name": { "type": "text" }, "near": { "type": "text" }, "nlikes": { "type": "integer" }, "nreplies": { "type": "integer" }, "nretweets": { "type": "integer" }, "photos": { "type": "text" }, "profile_image_url": { "type": "text" }, "quote_url": { "type": "text" }, "reply_to": { "type": "nested", "properties": { "id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "screen_name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "user_id": { "type": "keyword" }, "username": { "type": "keyword" } } }, "retweet": { "type": "text" }, 
"retweet_date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": true }, "retweet_id": { "type": "keyword" }, "search": { "type": "text" }, "source": { "type": "keyword" }, "thumbnail": { "type": "text" }, "timezone": { "type": "keyword" }, "trans_dest": { "type": "keyword" }, "trans_src": { "type": "keyword" }, "translate": { "type": "text" }, "tweet": { "type": "text" }, "urls": { "type": "keyword" }, "user_id_str": { "type": "keyword" }, "user_rt": { "type": "keyword" }, "user_rt_id": { "type": "keyword" }, "username": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "video": { "type": "integer" } } }'
Which files in twint to do you make these edits too?
The mapping, either created by twint explicitly, or implicitly and dynamically when indexing a tweet, is not adequate for two reasons:

- `mentions` is a nested object of `screen_name`, `name` and `id`, not just `text`.
- `place` is either a `text` or a nested object of `coordinates` and `type`, which Elasticsearch can't really handle well.

Therefore I did a quick fix REMOVING and re-creating the index with a slightly different mapping which doesn't dynamically add new fields, like the inconsistent `place` (`"dynamic": false`), and creates a nested `mentions`.

This however:
- deletes all tweets, consider reindexing if you don't want to, and
- never saves the place or other fields not specified in the mapping below.
If you know what you're doing, replace FRANK, SINATRA and JUNIOR with DELETE, PUT and PUT respectively.
curl -XFRANK "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' curl -XSINATRA "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d ' { "settings": { "analysis": { "normalizer": { "hashtag_normalizer": { "filter": [ "lowercase", "asciifolding" ], "type": "custom", "char_filter": [] } } } } }' curl -XJUNIOR "localhost:9200/twinttweets/_mapping?pretty" -H 'Content-Type: application/json' -d' { "dynamic": false, "properties": { "cashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "conversation_id": { "type": "long" }, "created_at": { "type": "text" }, "date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" }, "day": { "type": "integer" }, "essid": { "type": "keyword" }, "geo_near": { "type": "geo_point" }, "geo_tweet": { "type": "geo_point" }, "hashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "hour": { "type": "integer" }, "id": { "type": "long" }, "lang": { "type": "keyword" }, "language": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "link": { "type": "text" }, "location": { "type": "keyword" }, "mentions": { "type": "nested", "properties": { "id": { "type": "long" }, "name": { "type": "text" }, "screen_name": { "type": "text" } } }, "name": { "type": "text" }, "near": { "type": "text" }, "nlikes": { "type": "integer" }, "nreplies": { "type": "integer" }, "nretweets": { "type": "integer" }, "photos": { "type": "text" }, "profile_image_url": { "type": "text" }, "quote_url": { "type": "text" }, "reply_to": { "type": "nested", "properties": { "id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "screen_name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "user_id": { "type": "keyword" }, "username": { "type": "keyword" } } }, "retweet": { "type": "text" }, 
"retweet_date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": true }, "retweet_id": { "type": "keyword" }, "search": { "type": "text" }, "source": { "type": "keyword" }, "thumbnail": { "type": "text" }, "timezone": { "type": "keyword" }, "trans_dest": { "type": "keyword" }, "trans_src": { "type": "keyword" }, "translate": { "type": "text" }, "tweet": { "type": "text" }, "urls": { "type": "keyword" }, "user_id_str": { "type": "keyword" }, "user_rt": { "type": "keyword" }, "user_rt_id": { "type": "keyword" }, "username": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "video": { "type": "integer" } } }' `` DELETE and the first PUT works. However, when I run the second PUT I get this error:
{ "error" : { "root_cause" : [ { "type" : "illegal_argument_exception", "reason" : "can't merge a non object mapping [mentions] with an object mapping" } ], "type" : "illegal_argument_exception", "reason" : "can't merge a non object mapping [mentions] with an object mapping" }, "status" : 400 }
On running the curl command :
curl -d "{"settings":{"analysis":{"normalizer":{"hashtag_normalizer":{"filter":["lowercase","asciifolding"],"type":"custom","char_filter":[]}}}}}" -X PUT -H 'Content-Type:application/json' "localhost:9200/twinttweets?pretty"
I get the following error :
{
"error" : "Content-Type header [application/x-www-form-urlencoded] is not supported",
"status" : 406
}
Is this a bug in the target API?
P.S I tried using the solution provided by @prhbrt but on the cli (windows cmd) it was giving me the same error as above, hence I changed the json to the above.
The mapping, either created by twint explicitly, or implicitly and dynamically when indexing a tweet, is not adequate for two reasons:

- `mentions` is a nested object of `screen_name`, `name` and `id`, not just `text`.
- `place` is either a `text` or a nested object of `coordinates` and `type`, which Elasticsearch can't really handle well.

Therefore I did a quick fix REMOVING and re-creating the index with a slightly different mapping which doesn't dynamically add new fields, like the inconsistent `place` (`"dynamic": false`), and creates a nested `mentions`.

This however:
- deletes all tweets, consider reindexing if you don't want to, and
- never saves the place or other fields not specified in the mapping below.
If you know what you're doing, replace FRANK, SINATRA and JUNIOR with DELETE, PUT and PUT respectively.
curl -XFRANK "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' curl -XSINATRA "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d ' { "settings": { "analysis": { "normalizer": { "hashtag_normalizer": { "filter": [ "lowercase", "asciifolding" ], "type": "custom", "char_filter": [] } } } } }' curl -XJUNIOR "localhost:9200/twinttweets/_mapping?pretty" -H 'Content-Type: application/json' -d' { "dynamic": false, "properties": { "cashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "conversation_id": { "type": "long" }, "created_at": { "type": "text" }, "date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" }, "day": { "type": "integer" }, "essid": { "type": "keyword" }, "geo_near": { "type": "geo_point" }, "geo_tweet": { "type": "geo_point" }, "hashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "hour": { "type": "integer" }, "id": { "type": "long" }, "lang": { "type": "keyword" }, "language": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "link": { "type": "text" }, "location": { "type": "keyword" }, "mentions": { "type": "nested", "properties": { "id": { "type": "long" }, "name": { "type": "text" }, "screen_name": { "type": "text" } } }, "name": { "type": "text" }, "near": { "type": "text" }, "nlikes": { "type": "integer" }, "nreplies": { "type": "integer" }, "nretweets": { "type": "integer" }, "photos": { "type": "text" }, "profile_image_url": { "type": "text" }, "quote_url": { "type": "text" }, "reply_to": { "type": "nested", "properties": { "id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "screen_name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "user_id": { "type": "keyword" }, "username": { "type": "keyword" } } }, "retweet": { "type": "text" }, 
"retweet_date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": true }, "retweet_id": { "type": "keyword" }, "search": { "type": "text" }, "source": { "type": "keyword" }, "thumbnail": { "type": "text" }, "timezone": { "type": "keyword" }, "trans_dest": { "type": "keyword" }, "trans_src": { "type": "keyword" }, "translate": { "type": "text" }, "tweet": { "type": "text" }, "urls": { "type": "keyword" }, "user_id_str": { "type": "keyword" }, "user_rt": { "type": "keyword" }, "user_rt_id": { "type": "keyword" }, "username": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "video": { "type": "integer" } } }'
This is my first post on github, i'd like to say thank you.
The mapping, either created by
twint
explicitly, or implicitly and dynamically when indexing a tweet, is not adequate for two reasons:* `mentions` is a nested object of `screen_name`, `name` and `id`, not just `text`. * `place` is either a `text` or a nested object of `coordinates` and `type`, which elasticsearch can't really handle well.
Therefore I did a quick fix REMOVING and re-creating the index with a slightly different mapping which doesn't dynamically add new fields, like the inconsistent `place` (`"dynamic": false`), and creates a nested `mentions`.

This however:
* **deletes all tweets**, consider reindexing if you don't want to, and * never saves the place or other fields not specified in the mapping below.
If you know what you're doing, replace FRANK, SINATRA and JUNIOR with DELETE, PUT and PUT respectively.
curl -XFRANK "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' curl -XSINATRA "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d ' { "settings": { "analysis": { "normalizer": { "hashtag_normalizer": { "filter": [ "lowercase", "asciifolding" ], "type": "custom", "char_filter": [] } } } } }' curl -XJUNIOR "localhost:9200/twinttweets/_mapping?pretty" -H 'Content-Type: application/json' -d' { "dynamic": false, "properties": { "cashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "conversation_id": { "type": "long" }, "created_at": { "type": "text" }, "date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" }, "day": { "type": "integer" }, "essid": { "type": "keyword" }, "geo_near": { "type": "geo_point" }, "geo_tweet": { "type": "geo_point" }, "hashtags": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "hour": { "type": "integer" }, "id": { "type": "long" }, "lang": { "type": "keyword" }, "language": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "link": { "type": "text" }, "location": { "type": "keyword" }, "mentions": { "type": "nested", "properties": { "id": { "type": "long" }, "name": { "type": "text" }, "screen_name": { "type": "text" } } }, "name": { "type": "text" }, "near": { "type": "text" }, "nlikes": { "type": "integer" }, "nreplies": { "type": "integer" }, "nretweets": { "type": "integer" }, "photos": { "type": "text" }, "profile_image_url": { "type": "text" }, "quote_url": { "type": "text" }, "reply_to": { "type": "nested", "properties": { "id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "screen_name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "user_id": { "type": "keyword" }, "username": { "type": "keyword" } } }, "retweet": { "type": "text" }, 
"retweet_date": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": true }, "retweet_id": { "type": "keyword" }, "search": { "type": "text" }, "source": { "type": "keyword" }, "thumbnail": { "type": "text" }, "timezone": { "type": "keyword" }, "trans_dest": { "type": "keyword" }, "trans_src": { "type": "keyword" }, "translate": { "type": "text" }, "tweet": { "type": "text" }, "urls": { "type": "keyword" }, "user_id_str": { "type": "keyword" }, "user_rt": { "type": "keyword" }, "user_rt_id": { "type": "keyword" }, "username": { "type": "keyword", "normalizer": "hashtag_normalizer" }, "video": { "type": "integer" } } }'
DELETE works. However, when I run the first PUT I get this error: { "error" : "Incorrect HTTP method for uri [/twinttweets?pretty] and method [POST], allowed: [GET, HEAD, PUT, DELETE]", "status" : 405 }
$ curl -PUT "localhost:9200/twinttweets?pretty" -H 'Content-Type: application/json' -d '
{ "settings": { "analysis": { "normalizer": { "hashtag_normalizer": { "filter": [ "lowercase", "asciifolding" ], "type": "custom", "char_filter": [] } } } } }'
This helped! Thank you
Good morning,
Here you will find my issue, ask me if you need more information and how i can help to solve the issue.
Command Ran
twint -s "#covid19" -es localhost:9200
Description of Issue
1. The command line used to generate the error is shown above. 2. After installing ES and Kibana and importing the visualization dashboard, I can see the tweet count info, the right-hand window with the usernames of tweeters, and the heatmap of tweet times. In the other windows the error is: Could not locate that index-pattern-field (id: user_id)
Environment Details
I'm currently using Ubuntu 20 and the Firefox browser.
Thank you for reading, Thibaud