pridiltal / staplr

PDF Toolkit. :paperclip: :hammer: :wrench: :scissors: :bookmark_tabs: :file_folder::paperclip: :bookmark: :construction: :construction_worker:
https://pridiltal.github.io/staplr/
265 stars 27 forks source link

get_fields with multi-line value #55

Closed clarkwrks closed 1 year ago

clarkwrks commented 3 years ago

Hello, when using get_fields on a TextField with a multi-line value, only characters before the first line break are returned.

> in_fields <- get_fields("in.pdf")
> in_fields$multilinetext
$type
[1] "Text"

$name
[1] "multilinetext"

$value
[1] "Hello"

> in_fields$multilinetext$value <- paste0(c("Hello", "World"),
+                                             collapse = '\n')
> in_fields$multilinetext$value
[1] "Hello\nWorld"
> set_fields("in.pdf", "out.pdf", in_fields)
> get_fields("out.pdf")$multilinetext$value
[1] "Hello"

The pdf renders correctly with the full field value (whether manually entered/saved or with set_fields). Still trying to figure out whether the loss is happening in pdftk or get_fields.

Has anyone else hit this limitation? Any ideas for a workaround?

Thank you for a very useful package!

oganm commented 3 years ago

This needs to be tested before I add it in properly, but please give it a try. A current limitation is that parsing will fail if any line of a multi-line value begins with "Field". This (at least when the text matches a field name exactly) also seems to be an issue with pdftk itself, but I need to look into it further.

#' Read the form fields of a PDF file
#'
#' Dumps the form fields of a PDF with pdftk (`dump_data_fields`) into a
#' temporary file, parses the dump and returns one entry per field.
#' Multi-line field values are supported; a value ends at the next line that
#' starts with "Field", so a value that itself contains "\nField" is cut short.
#'
#' @param input_filepath Path to the PDF file. If `NULL`, the file is chosen
#'   interactively with `file.choose()`.
#' @param convert_field_names If `TRUE`, field names (both from the dump and
#'   from the FDF) are passed through `encodeUTF8()` before being used.
#' @param encoding_warning If `TRUE` and `convert_field_names` is `FALSE`,
#'   a warning lists every field whose name differs from `encodeUTF8(name)`.
#' @return A list named by field name. Each element is a list with the
#'   elements `type`, `name` and `value`. `value` is built as a factor when
#'   the field has `FieldStateOption` entries. Fields without a type and
#'   fields that do not appear in the FDF are dropped.
get_fields <- function(input_filepath = NULL, convert_field_names = FALSE, encoding_warning = TRUE){
  if(is.null(input_filepath)){
    # Choose the pdf file interactively
    input_filepath <- file.choose(new = FALSE)
  }

  input_filepath <- normalizePath(input_filepath, mustWork = TRUE)

  fieldsTemp <- tempfile()
  # the dump is only needed while parsing, so remove it however we exit
  on.exit(unlink(fieldsTemp), add = TRUE)

  # generate the data field dump in a temporary file
  # theoretically, using dump_data_fields_utf8 can get rid of the need to use sub_decimal
  # but this fails to process inputs containing stuff like emoji
  system_command <- paste(pdftk_cmd(),
                          shQuote(input_filepath),
                          'dump_data_fields', 'output',
                          shQuote(fieldsTemp))
  system(system_command)
  # here encoding isn't important because any unusual character is in numeric character references
  fields <- paste0(readLines(fieldsTemp, encoding = 'UTF-8'),
                   collapse = '\n')

  # decode HTML character entities
  # https://stackoverflow.com/questions/5060076/convert-html-character-entity-encoding-in-r
  fields <- XML::xpathApply(XML::htmlParse(fields, asText = TRUE, encoding = "UTF-8"),
                            "//body//text()",
                            XML::xmlValue)[[1]]

  # every field record is introduced by a '---' separator; whatever precedes
  # the first separator is not a field
  fields <- strsplit(fields, '---')[[1]][-1]

  # parse the fields
  fields <- lapply(fields, function(x){
    # str_extract returns NA when the record has no such line
    type <- stringr::str_extract(x, '(?<=FieldType: ).*?(?=\n|$)')
    name <- stringr::str_extract(x, '(?<=FieldName: ).*?(?=\n|$)')

    # the value may span several lines: it runs until the next line starting
    # with "Field" or until the trailing newline of the record
    value <- stringr::str_extract_all(x, '(?<=FieldValue: )(.|\n)*?(?=(\nField)|(\n$))')[[1]]
    # sometimes there are multiple field values. It is currently unclear why this happens
    # but the example file I have only created the extra fieldValue when there was
    # an entry.
    if(length(value) > 1){
      if(all(value == '')){
        value <- ''
      } else if(length(value[value != '']) == 1){
        value <- value[value != '']
      } else {
        warning(paste(name, "field has >1 FieldValues. set_fields only accepts fields of length one"))
      }
    }
    if(length(value) == 0){
      # sometimes FieldValue is not populated
      # note: if the field is a button, this will cause it to be returned as an NA.
      # this is later handled by fdfEdit function which replaces the NA with
      # an empty string when filling the fdf file.
      value <- ''
    }
    stateOptions <- stringr::str_extract_all(x, '(?<=FieldStateOption: ).*?(?=\n|$)')[[1]]

    if(length(stateOptions) > 0){
      value <- factor(sub_decimal(value), levels = sapply(stateOptions, sub_decimal))
    }

    if(convert_field_names){
      name <- encodeUTF8(name)
    }

    list(type = type,
         name = name,
         value = sub_decimal(value))
  })

  # collect names that change under encodeUTF8. Done after parsing so that it
  # does not depend on writing into a caller frame from inside lapply, and
  # guarded against records without a FieldName (name is NA).
  badFields <- character(0)
  if(!convert_field_names && encoding_warning){
    rawNames <- vapply(fields, function(x){x$name}, character(1))
    badFields <- Filter(function(nm){!is.na(nm) && nm != encodeUTF8(nm)}, rawNames)
  }

  if(length(badFields) > 0){
    warning(paste('some fields seems to include plain text UTF-8. Setting convert_field_names = TRUE might help. These fields have problematic names: \n', paste(badFields,collapse=', ')))
  }

  names(fields) <- vapply(fields, function(x){as.character(x$name)}, character(1))

  # remove typeless fields. it seems like nested hierarchies generate these typeless
  # fields that don't really exist and don't appear on the fdf file.
  # a record without a FieldType line has type NA, so test for that explicitly
  fieldTypes <- vapply(fields, function(x){x$type}, character(1))
  fields <- fields[!is.na(fieldTypes) & fieldTypes != '']

  # remove fields that don't appear on the FDF
  fdfLines <- get_fdf_lines(input_filepath)
  annotatedFDF <- fdfAnnotate(fdfLines)

  if(convert_field_names){
    annotatedFDF$fields <- sapply(annotatedFDF$fields, encodeUTF8)
  }
  fields <- fields[names(fields) %in% annotatedFDF$fields]

  # class(fields) = 'pdf_fields'

  fields
}
clarkwrks commented 2 years ago

That did the trick!

> in_fields$multilinetext$value <- paste0(c("Hello", "World"),
+                                         collapse = '\n')
> in_fields$multilinetext$value
[1] "Hello\nWorld"
> set_fields("in.pdf", "out.pdf", in_fields)
> get_fields("out.pdf")$multilinetext$value
[1] "Hello\nWorld"

Thank you!

oganm commented 2 years ago

Todo

oganm commented 1 year ago

This is now implemented, but it has some inevitable edge cases because the field dumps do not escape field values that might look like field components. In general, one should avoid field values that contain "\nField".