DS4SD / docling

Get your documents ready for gen AI
https://ds4sd.github.io/docling
MIT License
9.3k stars 442 forks source link

Support Google Docs #298

Open vtempest opened 4 days ago

vtempest commented 4 days ago

Requested feature

Use the Google Docs API to fetch the HTML of a document given its URL or document ID: directly if the document is public, or after requiring a login if it is private.

Alternatives

...

PeterStaar-IBM commented 2 days ago

@vtempest Is there a library for that, or would we need to write it from scratch?

vodkar commented 1 day ago

There is a google-api-python-client from google with Apache license. Is it possible to use it?

PeterStaar-IBM commented 1 day ago

Yes, Apache should be good to go!

vodkar commented 1 day ago

I'm looking for an opportunity to contribute to this project. I have some ideas on how to implement this feature, but I would like to discuss an implementation plan first. Should I open a discussion topic, or can we discuss it right here?

cau-git commented 1 day ago

@vodkar thanks for being open to contribute. We can discuss here. Do you have any ideas where you would start from yourself so far?

vodkar commented 1 day ago

> @vodkar thanks for being open to contribute. We can discuss here. Do you have any ideas where you would start from yourself so far?

I thought about splitting this task to the following subtasks:

  1. Implement a new InputFormat named GOOGLE_DOC. At this stage, it will be possible to retrieve content from public Google Docs, for example: https://docs.google.com/document/d/e/2PACX-1vRK-fDVc5WzQpAfjoEzDMudsG9GiMNG-LUTgZ6hrKxwTCCRvdUBDYDwpW9MiUOe_C5jryAfCQjgw-Jw/pub At this stage I plan just to convert the input document to PDF and then parse it with the existing StandardPdfPipeline.
  2. In the next step, extract text, images, etc. directly from the public document and parse them.
  3. The final step is to implement access to private docs. I still need to check whether this stage is necessary: if authentication is mandatory for working with Google Docs through the API, it will instead be implemented as part of step 1 or 2.
vtempest commented 1 day ago

` const { google } = require('googleapis'); const express = require('express'); const fs = require('fs').promises; const path = require('path'); const session = require('express-session');

const app = express();
const PORT = process.env.PORT || 3000;

// Configure session middleware. The routes below read and write
// `req.session.userId`, so this must be registered before them.
// NOTE(review): the literal fallback secret is only suitable for local
// experiments; set SESSION_SECRET in any shared or production environment.
app.use(session({
  secret: process.env.SESSION_SECRET || 'your-secret-key',
  resave: false,
  saveUninitialized: true,
  // Only mark the cookie as HTTPS-only when running in production.
  cookie: { secure: process.env.NODE_ENV === 'production' },
}));

// Configure OAuth2 settings.
// Path of the OAuth client credentials file, resolved from the working directory.
const CREDENTIALS_PATH = path.join(process.cwd(), 'credentials.json');
// Directory holding one `<userId>.token.json` file per user.
const TOKENS_DIR = path.join(process.cwd(), 'tokens');
// Scopes requested when generating the consent URL.
const SCOPES = [
  'https://www.googleapis.com/auth/drive.file',
  'https://www.googleapis.com/auth/documents',
];

/**
 * Wraps a Google OAuth2 client and persists one token file per user under
 * TOKENS_DIR.
 *
 * NOTE(review): a single OAuth2 client instance is shared by all users and
 * `getAuthClient` sets per-user credentials on it. Overlapping requests for
 * different users could therefore observe each other's credentials — confirm
 * whether a per-request client is needed before using this beyond a demo.
 */
class GoogleAuthManager {
  /**
   * @param {string} [redirectUri] - OAuth redirect URI passed to the OAuth2
   *   client. Defaults to the local callback route of this server.
   */
  constructor(redirectUri = 'http://localhost:3000/oauth2callback') {
    this.oAuth2Client = null;
    this.redirectUri = redirectUri;
  }

  /**
   * Reads the client credentials from CREDENTIALS_PATH, builds the OAuth2
   * client and makes sure TOKENS_DIR exists.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any read/parse/mkdir error after logging it.
   */
  async initialize() {
    try {
      const credentials = JSON.parse(await fs.readFile(CREDENTIALS_PATH));
      const { client_secret, client_id } = credentials.installed;

      // The redirect URIs listed in the credentials file are deliberately
      // ignored in favour of this server's own callback URL.
      this.oAuth2Client = new google.auth.OAuth2(
        client_id,
        client_secret,
        this.redirectUri
      );

      // Ensure tokens directory exists
      await fs.mkdir(TOKENS_DIR, { recursive: true });
    } catch (error) {
      console.error('Error initializing OAuth client:', error);
      throw error;
    }
  }

  /**
   * Loads the stored token for `userId`, refreshing and re-saving it when it
   * is about to expire.
   *
   * @param {string} userId - Identifier used to name the token file.
   * @returns {Promise<object|null>} The OAuth2 client with credentials set,
   *   or null when no usable token exists (the caller should then start the
   *   auth flow).
   * @throws {Error} If `initialize()` has not completed successfully.
   */
  async getAuthClient(userId) {
    if (!this.oAuth2Client) {
      throw new Error('OAuth client not initialized');
    }

    try {
      // Try to load existing token
      const token = JSON.parse(await fs.readFile(this.getTokenPath(userId)));
      this.oAuth2Client.setCredentials(token);

      // Check if token needs refresh
      if (this.isTokenExpired(token)) {
        const newToken = await this.refreshToken(token);
        await this.saveToken(userId, newToken);
        this.oAuth2Client.setCredentials(newToken);
      }

      return this.oAuth2Client;
    } catch (error) {
      // A missing token file is the normal "not logged in yet" case; anything
      // else (corrupt JSON, failed refresh, ...) is worth surfacing in logs.
      if (!error || error.code !== 'ENOENT') {
        console.warn('Stored token could not be used:', error);
      }
      // Either way the application should redirect to the auth flow.
      return null;
    }
  }

  /**
   * @param {object} token - Token object, expected to carry `expiry_date` in
   *   epoch milliseconds.
   * @returns {boolean} True when the token has no expiry date or expires in
   *   less than five minutes.
   */
  isTokenExpired(token) {
    if (!token.expiry_date) return true;
    const refreshMarginMs = 5 * 60 * 1000;
    return token.expiry_date <= Date.now() + refreshMarginMs;
  }

  /**
   * Refreshes the given token through the OAuth2 client.
   *
   * @param {object} token - Token to refresh.
   * @returns {Promise<object>} The refreshed credentials.
   */
  async refreshToken(token) {
    this.oAuth2Client.setCredentials(token);
    const { credentials } = await this.oAuth2Client.refreshAccessToken();
    return credentials;
  }

  /**
   * @returns {string} Consent-screen URL requesting SCOPES with offline access.
   */
  getAuthUrl() {
    return this.oAuth2Client.generateAuthUrl({
      access_type: 'offline',
      scope: SCOPES,
      prompt: 'consent', // Force prompt to ensure we get a refresh token
    });
  }

  /**
   * Exchanges an authorization code for tokens and stores them for `userId`.
   *
   * @param {string} code - Authorization code from the OAuth redirect.
   * @param {string} userId - Identifier used to name the token file.
   * @returns {Promise<object>} The tokens returned by the exchange.
   */
  async handleCallback(code, userId) {
    const { tokens } = await this.oAuth2Client.getToken(code);
    await this.saveToken(userId, tokens);
    return tokens;
  }

  /**
   * Writes `tokens` as JSON to the token file of `userId`.
   *
   * @param {string} userId - Identifier used to name the token file.
   * @param {object} tokens - Token object to persist.
   * @returns {Promise<void>}
   * @throws {Error} If `userId` would place the file outside TOKENS_DIR.
   */
  async saveToken(userId, tokens) {
    await fs.writeFile(this.getTokenPath(userId), JSON.stringify(tokens));
  }

  /**
   * Builds the token file path for a user.
   *
   * @param {string} userId - Identifier used to name the token file.
   * @returns {string} Path of `<userId>.token.json` inside TOKENS_DIR.
   * @throws {Error} If the resulting file name contains a path separator.
   */
  getTokenPath(userId) {
    const fileName = `${userId}.token.json`;
    // Reject ids such as "../x" that would escape the tokens directory.
    if (path.basename(fileName) !== fileName) {
      throw new Error('Invalid user id');
    }
    return path.join(TOKENS_DIR, fileName);
  }
}

// Create and initialize the auth manager.
// Initialization runs in the background; a failure is only logged, so routes
// that use the manager before it is ready will see an uninitialized client.
const authManager = new GoogleAuthManager();
authManager.initialize().catch(console.error);

// Express routes

// Starts the OAuth flow: makes sure the session carries a user ID, then
// redirects the browser to the Google consent screen.
app.get('/auth/google', (req, res) => {
  try {
    if (!req.session.userId) {
      // The user ID names the token file on disk, so it must be unique per
      // session; a random UUID avoids the collisions a timestamp would allow.
      req.session.userId = require('crypto').randomUUID();
    }
    res.redirect(authManager.getAuthUrl());
  } catch (error) {
    // e.g. the auth manager has not finished (or failed) initializing.
    console.error('Error starting OAuth flow:', error);
    res.status(500).send('Authentication failed');
  }
});

// OAuth redirect target: exchanges the `code` query parameter for tokens on
// behalf of the session's user, then redirects to the success page.
app.get('/oauth2callback', async (request, response) => {
  const authCode = request.query.code;
  const sessionUserId = request.session.userId;

  // Both the authorization code and a session user are needed to proceed.
  const hasRequiredParams = Boolean(authCode) && Boolean(sessionUserId);
  if (!hasRequiredParams) {
    return response.status(400).send('Missing required parameters');
  }

  try {
    await authManager.handleCallback(authCode, sessionUserId);
    // Redirect to your application's success page
    response.redirect('/success');
  } catch (callbackError) {
    console.error('Error handling OAuth callback:', callbackError);
    response.status(500).send('Authentication failed');
  }
});

// Example protected route that uses the auth client.
// Responds with the raw Google Docs API document JSON for `:docId`, or
// redirects to the auth flow when the session has no usable token.
app.get('/docs/:docId', async (req, res) => {
  const userId = req.session.userId;
  if (!userId) {
    return res.redirect('/auth/google');
  }

  try {
    const authClient = await authManager.getAuthClient(userId);
    if (!authClient) {
      return res.redirect('/auth/google');
    }

    // Use the auth client to access Google Docs API
    const docs = google.docs({ version: 'v1', auth: authClient });
    const document = await docs.documents.get({
      documentId: req.params.docId,
    });

    res.json(document.data);
  } catch (error) {
    console.error('Error accessing document:', error);
    res.status(500).send('Error accessing document');
  }
});

// Success page shown after the OAuth callback completes.
app.get('/success', (req, res) => {
  res.send('Authentication successful! You can close this window.');
});

// Start the server
app.listen(PORT, () => {
  console.log(`Server running on http://localhost:${PORT}`);
});

// Example of using the auth manager in your application code

/**
 * Returns the authorized OAuth2 client for a user, or fails loudly.
 *
 * @param {string} userId - Identifier whose stored token should be loaded.
 * @returns {Promise<object>} The client returned by `authManager.getAuthClient`.
 * @throws {Error} 'User not authorized' when no usable token exists; any
 *   error from the auth manager is logged and rethrown unchanged.
 */
async function getAuthorizedClient(userId) {
  try {
    const authClient = await authManager.getAuthClient(userId);
    if (!authClient) {
      // Handle unauthorized state in your application
      throw new Error('User not authorized');
    }
    return authClient;
  } catch (error) {
    console.error('Error getting authorized client:', error);
    throw error;
  }
}

module.exports = { GoogleAuthManager, getAuthorizedClient }

const { google } = require('googleapis'); const fs = require('fs').promises; const path = require('path');

// Configure OAuth2 credentials
// Path of the OAuth client credentials file, resolved from the working directory.
const CREDENTIALS_PATH = path.join(process.cwd(), 'credentials.json');
// Path where the obtained token is cached between runs.
const TOKEN_PATH = path.join(process.cwd(), 'token.json');
// Scopes requested when generating the consent URL.
const SCOPES = [
  'https://www.googleapis.com/auth/drive.file',
  'https://www.googleapis.com/auth/documents',
];

/**
 * Builds an OAuth2 client from the credentials file and attaches the cached
 * token when one can be read; otherwise delegates to `getNewToken`.
 *
 * @returns {Promise<object>} An OAuth2 client with credentials set.
 */
async function authorize() {
  const rawCredentials = await fs.readFile(CREDENTIALS_PATH);
  const installed = JSON.parse(rawCredentials).installed;
  const client = new google.auth.OAuth2(
    installed.client_id,
    installed.client_secret,
    installed.redirect_uris[0]
  );

  try {
    const rawToken = await fs.readFile(TOKEN_PATH);
    client.setCredentials(JSON.parse(rawToken));
  } catch (error) {
    // No usable cached token: fall back to the interactive flow.
    return getNewToken(client);
  }
  return client;
}

/**
 * Runs the interactive console flow: prints the consent URL, reads the
 * authorization code from stdin, exchanges it for tokens and caches them at
 * TOKEN_PATH.
 *
 * @param {object} oAuth2Client - OAuth2 client to authorize.
 * @returns {Promise<object>} The same client with credentials set.
 */
async function getNewToken(oAuth2Client) {
  const authUrl = oAuth2Client.generateAuthUrl({
    access_type: 'offline',
    scope: SCOPES,
  });

  console.log('Authorize this app by visiting:', authUrl);

  // This is a simple example - in production, you'd want to use a proper web server
  const readline = require('readline');
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  // Adapt the callback-style prompt to a promise and release stdin once answered.
  const code = await new Promise((resolve) => {
    rl.question('Enter the code from that page here: ', (answer) => {
      rl.close();
      resolve(answer);
    });
  });

  const { tokens } = await oAuth2Client.getToken(code);
  oAuth2Client.setCredentials(tokens);
  await fs.writeFile(TOKEN_PATH, JSON.stringify(tokens));
  console.log('Token stored to', TOKEN_PATH);
  return oAuth2Client;
}

/**
 * Fetches a Google Doc and renders its paragraphs as a minimal HTML string.
 * Only paragraph text runs are rendered, with bold/italic/underline mapped to
 * <strong>/<em>/<u>; every other structural element is skipped.
 *
 * @param {object} auth - Auth client passed to `google.docs`.
 * @param {string} documentId - ID of the document to read.
 * @returns {Promise<string>} HTML of the form `<html><body>…</body></html>`.
 * @throws Rethrows any API error after logging it.
 */
async function readDocAsHtml(auth, documentId) {
  const docs = google.docs({ version: 'v1', auth });

  try {
    // Get the document content
    const { data } = await docs.documents.get({ documentId });

    // Convert document structure to HTML
    const content = (data.body && data.body.content) || [];
    let html = '<html><body>';
    for (const element of content) {
      if (element.paragraph) {
        html += renderParagraphAsHtml(element.paragraph);
      }
    }
    html += '</body></html>';
    return html;
  } catch (error) {
    console.error('Error reading document:', error);
    throw error;
  }
}

// Escapes the characters that would otherwise be parsed as markup.
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

// Renders one paragraph's text runs as a <p> element with basic styling.
function renderParagraphAsHtml(paragraph) {
  let html = '<p>';
  for (const paragraphElement of paragraph.elements || []) {
    if (!paragraphElement.textRun) continue;

    const style = paragraphElement.textRun.textStyle || {};
    // Document text is data, not markup: escape it before wrapping in tags.
    let styledText = escapeHtml(paragraphElement.textRun.content || '');
    if (style.bold) styledText = `<strong>${styledText}</strong>`;
    if (style.italic) styledText = `<em>${styledText}</em>`;
    if (style.underline) styledText = `<u>${styledText}</u>`;

    html += styledText;
  }
  return `${html}</p>`;
}

/**
 * Inserts the text content of an HTML string at the start of a Google Doc.
 * Tags are stripped, so formatting is not carried over.
 *
 * @param {object} auth - Auth client passed to `google.docs`.
 * @param {string} documentId - ID of the document to update.
 * @param {string} html - HTML whose text content should be inserted.
 * @returns {Promise<void>}
 * @throws Rethrows any API error after logging it.
 */
async function writeHtmlToDoc(auth, documentId, html) {
  const docs = google.docs({ version: 'v1', auth });

  // Parse HTML content (you might want to use a proper HTML parser here)
  const cleanHtml = html
    .replace(/<[^>]*>/g, '') // Remove HTML tags for this simple example
    .trim();

  try {
    // Create requests array for document updates
    const requests = [
      {
        insertText: {
          location: {
            index: 1, // Insert at the beginning of the document
          },
          text: cleanHtml,
        },
      },
    ];

    // Execute the update
    await docs.documents.batchUpdate({
      documentId,
      requestBody: { requests },
    });

    console.log('Document updated successfully');
  } catch (error) {
    console.error('Error updating document:', error);
    throw error;
  }
}

// Example usage

/**
 * Demo entry point: authorizes, prints the document as HTML, then writes a
 * sample HTML snippet back to the same document. Errors are logged, not
 * rethrown.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    // Authorize and get client
    const auth = await authorize();

    // Example document ID (from the URL of a Google Doc)
    const documentId = 'YOUR_DOCUMENT_ID';

    // Read document as HTML
    console.log('Reading document...');
    const html = await readDocAsHtml(auth, documentId);
    console.log('Document HTML:', html);

    // Write HTML back to document
    console.log('Writing HTML to document...');
    const newHtml = '<p>Hello from Node.js!</p><p>This is a <strong>test</strong>.</p>';
    await writeHtmlToDoc(auth, documentId, newHtml);
  } catch (error) {
    console.error('Error:', error);
  }
}

main(); `