X-lab2017 / open-digger

Open source analysis tools
https://open-digger.cn
Apache License 2.0
291 stars 86 forks source link

[OpenSODA] How to import data into Clickhouse #1280

Closed jhzhou002 closed 1 year ago

jhzhou002 commented 1 year ago

Usage

Here there is no table structure after the JSON data is decompressed, and you want to import it to ClickHouse, how to import it.

Extract SQL

https://xlab2017.yuque.com/staff-kbz9wp/olpzth/tq36xvyzg9b880hy?singleDoc#UKotT Here the data import in the URL click house is a direct import of the data table or how. image

Does this dataset need to be updated regularly?

No response

xgdyp commented 1 year ago

Good question. Actually we use clickhouse only for sample-data before. As for csv-format data, I'm not sure. @bifenglin can you help answer this question?

xgdyp commented 1 year ago
-- auto-generated definition
create table gh_events
(
    id                                     UInt64,
    type                                   Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'WatchEvent' = 14, 'PullRequestReviewEvent' = 15),
    action                                 Enum8('added' = 1, 'closed' = 2, 'created' = 3, 'labeled' = 4, 'opened' = 5, 'published' = 6, 'reopened' = 7, 'started' = 8),
    actor_id                               UInt64,
    actor_login                            LowCardinality(String),
    repo_id                                UInt64,
    repo_name                              LowCardinality(String),
    org_id                                 UInt64,
    org_login                              LowCardinality(String),
    created_at                             DateTime,
    issue_id                               UInt64,
    issue_number                           UInt32,
    issue_title                            String,
    body                                   String,
    `issue_labels.name`                    Array(String),
    `issue_labels.color`                   Array(String),
    `issue_labels.default`                 Array(UInt8),
    `issue_labels.description`             Array(String),
    issue_author_id                        UInt64,
    issue_author_login                     LowCardinality(String),
    issue_author_type                      Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    issue_author_association               Enum8('COLLABORATOR' = 1, 'CONTRIBUTOR' = 2, 'MEMBER' = 3, 'NONE' = 4, 'OWNER' = 5, 'MANNEQUIN' = 6),
    issue_assignee_id                      UInt64,
    issue_assignee_login                   LowCardinality(String),
    `issue_assignees.login`                Array(LowCardinality(String)),
    `issue_assignees.id`                   Array(UInt64),
    issue_created_at                       Nullable(DateTime),
    issue_updated_at                       Nullable(DateTime),
    issue_comments                         UInt16,
    issue_closed_at                        Nullable(DateTime),
    issue_comment_id                       UInt64,
    issue_comment_created_at               Nullable(DateTime),
    issue_comment_updated_at               Nullable(DateTime),
    issue_comment_author_association       Enum8('COLLABORATOR' = 1, 'CONTRIBUTOR' = 2, 'MEMBER' = 3, 'NONE' = 4, 'OWNER' = 5, 'MANNEQUIN' = 6),
    issue_comment_author_id                UInt64,
    issue_comment_author_login             LowCardinality(String),
    issue_comment_author_type              Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    pull_commits                           UInt16,
    pull_additions                         UInt16,
    pull_deletions                         UInt16,
    pull_changed_files                     UInt32,
    pull_merged                            UInt8,
    pull_merge_commit_sha                  String,
    pull_merged_at                         Nullable(DateTime),
    pull_merged_by_id                      UInt64,
    pull_merged_by_login                   LowCardinality(String),
    pull_merged_by_type                    Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    pull_requested_reviewer_id             UInt64,
    pull_requested_reviewer_login          LowCardinality(String),
    pull_requested_reviewer_type           Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    pull_review_comments                   UInt16,
    repo_description                       String,
    repo_size                              UInt32,
    repo_stargazers_count                  UInt32,
    repo_forks_count                       UInt32,
    repo_language                          String,
    repo_has_issues                        UInt8,
    repo_has_projects                      UInt8,
    repo_has_downloads                     UInt8,
    repo_has_wiki                          UInt8,
    repo_has_pages                         UInt8,
    repo_license                           String,
    repo_default_branch                    String,
    repo_created_at                        Nullable(DateTime),
    repo_updated_at                        Nullable(DateTime),
    repo_pushed_at                         Nullable(DateTime),
    pull_review_id                         UInt64,
    pull_review_comment_id                 UInt64,
    pull_review_comment_path               String,
    pull_review_comment_position           String,
    pull_review_comment_author_id          UInt64,
    pull_review_comment_author_login       LowCardinality(String),
    pull_review_comment_author_type        Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    pull_review_comment_author_association Enum8('COLLABORATOR' = 1, 'CONTRIBUTOR' = 2, 'MEMBER' = 3, 'NONE' = 4, 'OWNER' = 5, 'MANNEQUIN' = 6),
    pull_review_comment_created_at         Nullable(DateTime),
    pull_review_comment_updated_at         Nullable(DateTime),
    push_id                                UInt64,
    push_size                              UInt32,
    push_distinct_size                     UInt32,
    push_ref                               String,
    push_head                              String,
    `push_commits.name`                    Array(LowCardinality(String)),
    `push_commits.email`                   Array(String),
    `push_commits.message`                 Array(String),
    fork_forkee_id                         UInt64,
    fork_forkee_full_name                  LowCardinality(String),
    fork_forkee_owner_id                   UInt64,
    fork_forkee_owner_login                LowCardinality(String),
    fork_forkee_owner_type                 Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    delete_ref                             String,
    delete_ref_type                        String,
    delete_pusher_type                     Enum8('deploy_key' = 1, 'user' = 2),
    create_ref                             String,
    create_ref_type                        Enum8('branch' = 1, 'tag' = 2),
    create_master_branch                   String,
    create_description                     String,
    create_pusher_type                     Enum8('deploy_key' = 1, 'user' = 2),
    `gollum_pages.page_name`               Array(String),
    `gollum_pages.title`                   Array(String),
    `gollum_pages.action`                  Array(String),
    member_id                              UInt64,
    member_login                           LowCardinality(String),
    member_type                            Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    release_id                             UInt64,
    release_tag_name                       String,
    release_target_commitish               String,
    release_name                           String,
    release_draft                          UInt8,
    release_author_id                      UInt64,
    release_author_login                   LowCardinality(String),
    release_author_type                    Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    release_prerelease                     UInt8,
    release_created_at                     Nullable(DateTime),
    release_published_at                   Nullable(DateTime),
    release_body                           String,
    `release_assets.name`                  Array(String),
    `release_assets.uploader_login`        Array(LowCardinality(String)),
    `release_assets.uploader_id`           Array(UInt64),
    `release_assets.content_type`          Array(LowCardinality(String)),
    `release_assets.state`                 Array(String),
    `release_assets.size`                  Array(UInt64),
    `release_assets.download_count`        Array(UInt16),
    commit_comment_id                      UInt64,
    commit_comment_author_id               UInt64,
    commit_comment_author_login            LowCardinality(String),
    commit_comment_author_type             Enum8('Bot' = 1, 'Mannequin' = 2, 'Organization' = 3, 'User' = 4),
    commit_comment_author_association      Enum8('COLLABORATOR' = 1, 'CONTRIBUTOR' = 2, 'MEMBER' = 3, 'NONE' = 4, 'OWNER' = 5, 'MANNEQUIN' = 6),
    commit_comment_path                    String,
    commit_comment_position                String,
    commit_comment_line                    String,
    commit_comment_created_at              Nullable(DateTime),
    commit_comment_updated_at              Nullable(DateTime)
)
    engine = ReplacingMergeTree(id)
        PARTITION BY toYYYYMM(created_at)
        ORDER BY (type, repo_name, created_at)
        SETTINGS index_granularity = 8192;

@318352733 hi there are DDL of opensource table. You can try create table first and import it .

jhzhou002 commented 1 year ago

image And this JSON data. @xgdyp

xgdyp commented 1 year ago

hi, JSON data can not import to clickhouse now.