yihui / mime

Map filenames to MIME types
https://cran.rstudio.com/package=mime
32 stars 15 forks source link

Reverse guess #6

Open jeroen opened 8 years ago

jeroen commented 8 years ago

I need a way to determine a file extension for saving a file from the web to disk (similar to 'save as' in a browser). The problem is that mimemap has multiple options for several MIME types.

> which(mimemap == "text/plain")
 asc  txt text  pot  brf  srt 
 452  453  454  455  456  457 

Is there some way to include a preferred extension for a given type? E.g. in this case .txt would probably be the best choice.

jeroen commented 8 years ago

Basically this: http://www.freeformatter.com/mime-types-list.html

yihui commented 8 years ago

No, there is currently no way to do that in this package. There has to be a database stored in the package like the page you referred to. If you can make sure that page can be copied and remixed into this package, a pull request will be welcome :)

muschellij2 commented 5 years ago

I can set up data-raw with the following code to map that data (gists are not working):

library(rvest)
library(dplyr)
library(tidyr)

# Source 1: the MIME type list published on freeformatter.com.
url <- "https://www.freeformatter.com/mime-types-list.html"
doc <- read_html(url)

#############################
# Read in the table
#############################
# html_table(doc) yields a list of tables; insist on exactly one so that
# taking the first element below is unambiguous.
tab <- html_table(doc)
stopifnot(length(tab) == 1)
# Shorten the page's column headers to names that are easy to work with.
tab <- rename(
  tab[[1]],
  name = Name,
  mime_type = `MIME Type / Internet Media Type`,
  ext = `File Extension`,
  details = `More Details`
)
# Drop a trailing comma from the MIME type and turn the literal "N/A"
# extension into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(",$", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)
# Every row is expected to carry a MIME type.
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)

#################################
# Rows listing several extensions
#################################
# Remember how many comma-separated extensions each source row listed. The
# count is kept as a column so multi-extension rows can still be inspected
# after they have been expanded.
tab$n_ext <- lengths(strsplit(tab$ext, split = ","))
# Expand to one row per extension. The separator tolerates optional
# whitespace after the comma, so the split always agrees with the count
# above, and any number of extensions per row is handled (a missing
# extension stays a single NA row).
tab <- tab %>%
  separate_rows(ext, sep = ",\\s*") %>%
  arrange(mime_type)
# test case
tab[grepl("atom", tab$mime_type), ]
tab[tab$n_ext > 1, ]

#################################
# Rows listing several mime types
#################################
# Expand to one row per MIME type. Any number of comma-separated types is
# handled, and surrounding whitespace is trimmed so that a type written
# after ", " still matches the same type coming from the other sources.
tab <- tab %>%
  separate_rows(mime_type, sep = ",") %>%
  mutate(mime_type = trimws(mime_type)) %>%
  arrange(mime_type)

# test case
tab[grepl("java", tab$mime_type), ]

# Rows without a MIME type, shown for inspection before they are dropped.
tab[is.na(tab$mime_type), ]

# Keep only complete (mime_type, ext) pairs; this is the first source table
# that the later sources are joined onto.
tab <- tab %>%
  filter(
    !is.na(mime_type),
    !mime_type %in% "",
    !is.na(ext),
    !ext %in% ""
  ) %>%
  select(mime_type, ext)
first_tab <- tab

# Source 2: the MIME type list published on sitepoint.com.
url <- "https://www.sitepoint.com/mime-types-complete-list/"
doc <- read_html(url)

#############################
# Read in the table
#############################
# As with the first source, exactly one table is expected on the page.
tab <- html_table(doc)
stopifnot(length(tab) == 1)
tab <- rename(
  tab[[1]],
  ext = `Suffixes applicable`,
  mime_type = `Media type and subtype(s)`
)
# Exactly two MIME type cells are expected to contain a space; stop if the
# page no longer matches that before truncating them below.
have_spaces <- grepl(" ", tab$mime_type)
stopifnot(sum(have_spaces) == 2)
# Keep only the text before the first space of each MIME type and turn the
# literal "N/A" extension into a real missing value.
tab <- mutate(
  tab,
  mime_type = sub(" .*", "", mime_type),
  ext = ifelse(ext %in% "N/A", NA, ext)
)
n_missing <- sum(is.na(tab$mime_type))
stopifnot(n_missing == 0)

# Merge with the table built from the first source.
tab <- full_join(tab, first_tab)

####################################
# Another set of types
####################################
# Source 3: a plain-text Mime.types file. Each non-comment line holds a
# MIME type, then tab(s), then zero or more space-separated extensions.
url <- "https://raw.githubusercontent.com/hoaproject/Mime/master/Mime.types"
doc <- readLines(url)

# Drop comment lines and empty lines.
doc <- doc[!grepl("^#", doc)]
doc <- doc[!(doc %in% "")]
# Split each line into its tab-separated fields; a run of tabs is treated
# as a single separator.
fields <- strsplit(doc, "\t+")
df <- lapply(fields, function(x) {
  # Everything after the first field is a list of extensions.
  exts <- as.character(unlist(strsplit(x[-1], split = " ")))
  exts <- exts[nzchar(exts)]
  # The extensions here are parsed without a leading "."; add one (only
  # when missing) so they use the same dotted form as the mime::mimemap
  # rows joined later. sub() keeps a zero-length input zero-length, so a
  # type without extensions contributes no rows.
  exts <- sub("^\\.?", ".", exts)
  tibble(
    ext = exts,
    mime_type = rep(x[1], length(exts))
  )
})
df <- bind_rows(df)
df <- df %>%
  filter(
    !is.na(ext),
    !is.na(mime_type)
  )
# Join on the explicit key pair rather than whatever columns happen to be
# shared between the two tables.
tab <- full_join(tab, df, by = c("mime_type", "ext"))

# Source 4: the mapping shipped with the mime package. The names of
# mime::mimemap are used as extensions and its values as MIME types; a
# leading "." is added to each name to form the extension.
mime_df <- tibble(
  mime_type = unname(mime::mimemap),
  ext = paste0(".", names(mime::mimemap))
)

# Final table: the union of all sources, joined on explicit keys, with
# identical rows contributed by several sources collapsed into one.
mime_extensions <- full_join(tab, mime_df, by = c("mime_type", "ext")) %>%
  distinct()
# Spot check the motivating example; %in% never yields NA, so rows with a
# missing MIME type cannot appear as all-NA rows in the output.
mime_extensions[mime_extensions$mime_type %in% "text/plain", ]