DevLab-Duke / mlp-data-intro

Repository for: Tracking Civic Space in Developing Countries with a High-Quality Corpus of Domestic Media and Language Models
0 stars 0 forks source link

Automated Source Printing #2

Closed jrspringman closed 7 hours ago

jrspringman commented 1 week ago

Rather than having a static (and very ugly) appendix of sources, we should have an automated script that prints the full list of sources and the most recent month of available data for each country. The updated list of both elements is always available in ML4P-Civic-Space-Forecasting/ml4p.forecast/R/constants.R. Togbedji actually already did this for the sources in the pipeline report, so we should obviously use this to create a separate rmd appendix in this repo that we can submit alongside the main paper compiled in latex. This tells the world what sources we're using!

We just need an rmd file that pulls constants.R into this repo, loads it, and then runs the code below.

`## Appendix 1: List of Digital News Sources Being Used by Country and Region

library(knitr)
opts_chunk$set(tidy.opts = list(width.cutoff = 80), tidy = TRUE)
# print(local_source_select("Ghana"))
# isources

# Print the international sources as a comma-separated list, six per line.
# `isources` is not defined in this chunk; it is expected to be in scope
# already (per the surrounding issue text it comes from constants.R).
# The header ends with a newline so the first source is not glued to the
# colon, matching the regional headers printed further down.
cat("  -  International Sources:\n")

# Drop the ".csv" file extension from every entry in one vectorized call.
intl_sources <- sub("\\.csv$", "", isources)

if (length(intl_sources) > 0) {
  per_line <- 6

  # Group the sources into consecutive rows of `per_line` entries.
  line_id <- ceiling(seq_along(intl_sources) / per_line)
  intl_lines <- vapply(
    split(intl_sources, line_id),
    paste,
    character(1),
    collapse = ", "
  )

  # Commas go only between entries, so nothing dangles after the last one.
  cat(paste(intl_lines, collapse = ",\n"))
}

# Always finish with a blank line so later output starts on a fresh line.
cat("\n\n")

Sub-Saharan Africa:

# Countries listed under the Sub-Saharan Africa heading.
countries_afr <- c(
  "Angola", "Benin", "Cameroon", "DR Congo", "Ethiopia", "Ghana", "Kenya",
  "Liberia", "Mali", "Malawi", "Mauritania", "Mozambique", "Niger",
  "Nigeria", "Rwanda", "Senegal", "South Africa", "Tanzania", "Uganda",
  "Zambia", "Zimbabwe"
)

cat("  -   Africa Regional Sources:", "\n")
cat("africanews.com", "theeastafrican.co.ke", "iwpr.net\n\n")

# For each country, print a heading line followed by its local sources as a
# comma-separated list wrapped into rows. `local_source_select()` is not
# defined in this chunk; it is expected to be in scope already (per the
# surrounding issue text it comes from constants.R).
for (country in countries_afr) {
  cat(paste0("  -  ", country, ":\n"))

  sources <- local_source_select(country)$lsources
  sources <- gsub("\\.csv$", "", sources)

  # Liberia is laid out three per row, every other country five per row.
  # NOTE(review): presumably because Liberia's source names are long -
  # confirm.
  num_columns <- if (country == "Liberia") 3 else 5

  if (length(sources) > 0) {
    # Group the sources into consecutive rows of `num_columns` entries.
    row_id <- ceiling(seq_along(sources) / num_columns)
    rows <- vapply(
      split(sources, row_id),
      paste,
      character(1),
      collapse = ", "
    )

    # Commas go only between entries: none after the last source, and the
    # comma is kept where a row wraps.
    cat(paste(rows, collapse = ",\n"))
  }

  # Exactly one blank line after every country, whatever the source count.
  cat("\n\n")
}

Middle East and North Africa:

# Countries listed under the Middle East and North Africa heading.
countries_mena <- c("Morocco", "Tunisia", "Turkey")

# One heading line per country, then its sources as a comma-separated list
# that wraps after every fifth entry and ends with a blank line.
for (country in countries_mena) {
  cat("  -  ", country, ":\n", sep = "")

  # Source names with the ".csv" extension stripped.
  sources <- gsub("\\.csv$", "", local_source_select(country)$lsources)

  # Work out what follows each source: a blank line after the last one,
  # otherwise a comma, plus a line break after every fifth entry.
  n_sources <- length(sources)
  position <- seq_len(n_sources)
  suffix <- ifelse(
    position < n_sources,
    ifelse(position %% 5 == 0, ", \n", ", "),
    "\n\n"
  )

  # An empty source list prints nothing at all here.
  cat(paste0(sources, suffix), sep = "")
}

Eastern Europe:

# Regional outlets that cover Eastern Europe as a whole.
cat("  -   Eastern Europe Regional Sources:\n")
cat(
  "euronews.com/tag/eastern-europe",
  "neweasterneurope.edu",
  "balkaninsight.com",
  "iwpr.net\n\n"
)

# Countries listed under the Eastern Europe heading.
countries_est_eu <- c(
  "Albania", "Armenia", "Azerbaijan", "Belarus", "Georgia", "Hungary",
  "Macedonia", "Moldova", "Kosovo", "Ukraine", "Serbia"
)

# One heading line per country, then its sources as a comma-separated list
# that wraps after every fifth entry and ends with a blank line.
for (country in countries_est_eu) {
  cat("  -  ", country, ":\n", sep = "")

  # Source names with the ".csv" extension stripped.
  sources <- gsub("\\.csv$", "", local_source_select(country)$lsources)
  n_sources <- length(sources)

  for (k in seq_along(sources)) {
    # Pick what follows this source: a blank line after the final entry,
    # a comma plus line break after every fifth entry, a comma otherwise.
    terminator <- if (k == n_sources) {
      "\n\n"
    } else if (k %% 5 == 0) {
      ", \n"
    } else {
      ", "
    }

    cat(sources[k], terminator, sep = "")
  }
}

Latin America and the Caribbean:

# Regional outlets that cover Latin America as a whole.
cat("  -   Latin America Regional Sources:\n")
cat("elpais.com", "cnnespanol.cnn.com", "iwpr.net\n\n")

# Countries listed under the Latin America and the Caribbean heading.
countries_lacar <- c(
  "Colombia", "Ecuador", "El Salvador", "Guatemala", "Honduras", "Jamaica",
  "Nicaragua", "Paraguay", "Peru"
)

# One heading line per country, then its sources as a comma-separated list
# that wraps after every fifth entry and ends with a blank line.
for (country in countries_lacar) {
  cat("  -  ", country, ":\n", sep = "")

  # Source names with the ".csv" extension stripped.
  sources <- gsub("\\.csv$", "", local_source_select(country)$lsources)

  # Nothing is printed after the heading when a country has no sources.
  if (length(sources) > 0) {
    # Cut the list into consecutive rows of five and join each row.
    row_id <- ceiling(seq_along(sources) / 5)
    rows <- vapply(
      split(sources, row_id),
      paste,
      character(1),
      collapse = ", "
    )

    # Rows are joined with a comma, a space and a line break; the list is
    # closed with a blank line.
    cat(paste(rows, collapse = ", \n"), "\n\n", sep = "")
  }
}

Asia:


# Regional outlets that cover Asia as a whole.
cat("  -   Asia Regional Sources:\n")
cat("asiatimes.com", "asia.nikkei.com", "iwpr.net\n\n")

# Countries listed under the Asia heading.
countries_asia <- c(
  "Bangladesh", "Cambodia", "Indonesia", "India", "Kazakhstan",
  "Kyrgyzstan", "Malaysia", "Philippines", "Sri Lanka", "Uzbekistan"
)

# One heading line per country, then its sources as a comma-separated list
# that wraps after every fifth entry and ends with a blank line.
for (country in countries_asia) {
  cat("  -  ", country, ":\n", sep = "")

  # Source names with the ".csv" extension stripped.
  sources <- gsub("\\.csv$", "", local_source_select(country)$lsources)
  n_sources <- length(sources)

  # Nothing is printed after the heading when a country has no sources.
  if (n_sources > 0) {
    # Default ending is a comma; every fifth entry also gets a line break,
    # and the final entry is followed by a blank line instead.
    endings <- rep(", ", n_sources)
    endings[seq_len(n_sources) %% 5 == 0] <- ", \n"
    endings[n_sources] <- "\n\n"

    cat(paste0(sources, endings), sep = "")
  }
}
`
dmoratz commented 7 hours ago

Created an appendix .qmd file that does this and pulls in everything else from the overleaf appendix, plus adds the civic space definitions.