def _format_timestamp(epoch_seconds):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS'; return None if it is falsy."""
    if not epoch_seconds:
        return None
    # No tz argument is passed, so the result is expressed in the local
    # timezone of the machine running the script.
    return datetime.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')


def _join_content_parts(content, role):
    """Join the entries of content['parts'] into one newline-separated string.

    String parts are used verbatim. Dict parts are converted with str() and
    also dumped to stdout so their shape can be inspected. Parts of any other
    type are reported on stdout and left out of the result.

    Args:
        content: The message's 'content' dict.
        role: Author role; only used to label the diagnostic output.

    Returns:
        The joined text, or '' when there are no usable parts.
    """
    pieces = []
    # A missing or null 'parts' entry is treated as "no parts" instead of
    # raising KeyError/TypeError and aborting the whole export.
    for part in content.get('parts') or []:
        if isinstance(part, str):
            pieces.append(part)
        elif isinstance(part, dict):
            print(f"Encountered a dict in {role} content parts: {part}")
            # Placeholder handling: keep the dict's repr in the text for now.
            pieces.append(str(part))
            print(json.dumps(part, sort_keys=True, indent=4))
        else:
            print(f"Unknown content type in {role} content parts: {type(part)}")
    return '\n'.join(pieces)


def get_chatgpt_data(conv_arr):
    """Flatten each conversation's 'mapping' into a 'messages' list, in place.

    For every conversation a new 'messages' list is built from the values of
    conv['mapping'], and the 'mapping' key is then deleted. Only two kinds of
    message are kept:

    * every 'user' message;
    * 'assistant' messages whose content_type is 'text'.

    Nodes without a message, messages from any other role ('tool', 'system',
    ...) and assistant messages with any other content type are skipped.

    Each kept entry is a dict with 'role', 'insert_dt', 'update_dt' and
    'content'; assistant entries additionally carry 'model' (the metadata's
    'model_slug', or None).

    Args:
        conv_arr: Iterable of conversation dicts, each with a 'mapping' dict.

    Returns:
        The same conv_arr object, with its conversations modified in place.

    Raises:
        KeyError: If a conversation has no 'mapping', or a message has no
            author role.
    """
    for conv in conv_arr:
        # Start from a fresh list for each conversation.
        messages = []
        conv['messages'] = messages

        for node in conv['mapping'].values():
            message = node.get('message')
            if not message:
                continue

            role = message['author']['role']
            if role not in ('user', 'assistant'):
                # Other roles (tool, system, ...) are not exported.
                continue

            content = message.get('content') or {}
            obj = {
                'role': role,
                'insert_dt': _format_timestamp(message.get('create_time')),
                'update_dt': _format_timestamp(message.get('update_time')),
            }

            if role == 'assistant':
                if content.get('content_type') != 'text':
                    # Non-text assistant content (e.g. 'code') is not exported.
                    continue
                obj['model'] = (message.get('metadata') or {}).get('model_slug')

            obj['content'] = _join_content_parts(content, role)
            messages.append(obj)

        # The raw mapping is no longer needed once the messages are extracted.
        del conv['mapping']
    return conv_arr
I was wrong in my assumption that this happens when DALL-E generates images. This structure appears on the user side of the chat when files have been uploaded.
Type of content found in a dict instead of the expected string:
{ "asset_pointer": "file-service://file-XXXXXXXXXXXXXXXXXXXXXXXX", "content_type": "image_asset_pointer", "fovea": null, "height": 482, "metadata": { "dalle": null, "gizmo": null, "sanitized": true }, "size_bytes": 26664, "width": 802 }
Revised function that works from util script.py