Skip to content

Convert a PDF URL to HTML

When the source URL is a PDF, Microlink can still expose an HTML DOM for extraction. Set attr: 'html' on a data rule to return the converted markup.

The following examples show how to use the Microlink API with CLI, cURL, JavaScript, Python, Ruby, PHP & Golang, targeting 'https://cdn.microlink.io/file-examples/sample.pdf' URL with 'data' & 'meta' API parameters:

CLI Microlink API example

# Quote the argument: an unquoted & would make the shell background the command and drop the parameters.
microlink 'https://cdn.microlink.io/file-examples/sample.pdf&data.html.attr=html'

cURL Microlink API example

# -G sends the -d pairs as query-string parameters of a GET request.
curl -G "https://api.microlink.io" \
  -d "url=https://cdn.microlink.io/file-examples/sample.pdf" \
  -d "data.html.attr=html" \
  -d "meta=false"

JavaScript Microlink API example

import mql from '@microlink/mql'

// PDF whose converted markup is requested.
const pdfUrl = 'https://cdn.microlink.io/file-examples/sample.pdf'

// One data rule named `html` asking for the "html" attribute, with meta turned off.
const options = {
  data: { html: { attr: "html" } },
  meta: false
}

const { data } = await mql(pdfUrl, options)

Python Microlink API example

import requests

API_URL = "https://api.microlink.io/"

# Query-string parameters: the PDF to convert plus a data rule requesting its HTML.
params = {
    "url": "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.attr": "html",
    "meta": "false",
}

result = requests.get(API_URL, params=params)
payload = result.json()

print(payload)

Ruby Microlink API example

require 'uri'
require 'net/http'

base_url = "https://api.microlink.io/"

# Dotted parameter names are not valid bare hash keys in Ruby,
# so every key is written as a string.
params = {
  "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
  "data.html.attr" => "html",
  "meta" => "false"
}

# Encode the parameters into the query string of the API URL.
uri = URI(base_url)
uri.query = URI.encode_www_form(params)

http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true

request = Net::HTTP::Get.new(uri)
response = http.request(request)

puts response.body

PHP Microlink API example

<?php

// Microlink API endpoint; the parameters below are appended as a query string.
$endpoint = "https://api.microlink.io/";

$query = http_build_query([
    "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.attr" => "html",
    "meta" => "false"
]);

$handle = curl_init();

curl_setopt_array($handle, [
    CURLOPT_URL => $endpoint . '?' . $query,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_ENCODING => "",
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    CURLOPT_CUSTOMREQUEST => "GET"
]);

$body = curl_exec($handle);
$error = curl_error($handle);

curl_close($handle);

// Print the transport error when there is one, otherwise the response body.
echo $error ? "cURL Error #: " . $error : $body;

Golang Microlink API example

package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"
)

// main requests the Microlink API for the sample PDF and prints the raw response body.
func main() {
    endpoint, err := url.Parse("https://api.microlink.io")
    if err != nil {
        panic(err)
    }

    // Encode the API parameters into the query string.
    params := url.Values{}
    params.Set("url", "https://cdn.microlink.io/file-examples/sample.pdf")
    params.Set("data.html.attr", "html")
    params.Set("meta", "false")
    endpoint.RawQuery = params.Encode()

    req, err := http.NewRequest(http.MethodGet, endpoint.String(), nil)
    if err != nil {
        panic(err)
    }

    resp, err := (&http.Client{}).Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    payload, err := io.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(payload))
}
Read the converted PDF markup from data.html.

Return HTML directly

Add embed: 'html' when the API URL itself should respond with the HTML instead of a JSON payload:

The following examples show how to use the Microlink API with CLI, cURL, JavaScript, Python, Ruby, PHP & Golang, targeting 'https://cdn.microlink.io/file-examples/sample.pdf' URL with 'data', 'meta' & 'embed' API parameters:

CLI Microlink API example

# Quote the argument: an unquoted & would make the shell background the command and drop the parameters.
microlink 'https://cdn.microlink.io/file-examples/sample.pdf&data.html.attr=html&embed=html'

cURL Microlink API example

# -G sends the -d pairs as query-string parameters of a GET request.
curl -G "https://api.microlink.io" \
  -d "url=https://cdn.microlink.io/file-examples/sample.pdf" \
  -d "data.html.attr=html" \
  -d "meta=false" \
  -d "embed=html"

JavaScript Microlink API example

import mql from '@microlink/mql'

// Same data rule as the plain HTML extraction, plus embed: "html".
// NOTE(review): this page states that with embed the response body is HTML,
// not a JSON payload — confirm that mql still exposes it as `data`.
const { data } = await mql('https://cdn.microlink.io/file-examples/sample.pdf', {
  data: {
    html: {
      attr: "html"
    }
  },
  meta: false,
  embed: "html"
})

Python Microlink API example

import requests

url = "https://api.microlink.io/"

# Query-string parameters; "embed" makes the API respond with the HTML itself.
querystring = {
    "url": "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.attr": "html",
    "meta": "false",
    "embed": "html"
}

response = requests.get(url, params=querystring)

# The body is HTML rather than JSON, so read it as text; response.json() would fail to decode it.
print(response.text)

Ruby Microlink API example

require 'uri'
require 'net/http'

base_url = "https://api.microlink.io/"

# Dotted parameter names are not valid bare hash keys in Ruby,
# so every key is written as a string.
params = {
  "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
  "data.html.attr" => "html",
  "meta" => "false",
  "embed" => "html"
}

# Encode the parameters into the query string of the API URL.
uri = URI(base_url)
uri.query = URI.encode_www_form(params)

http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true

request = Net::HTTP::Get.new(uri)
response = http.request(request)

puts response.body

PHP Microlink API example

<?php

// Microlink API endpoint; the parameters below are appended as a query string.
$endpoint = "https://api.microlink.io/";

$query = http_build_query([
    "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.attr" => "html",
    "meta" => "false",
    "embed" => "html"
]);

$handle = curl_init();

curl_setopt_array($handle, [
    CURLOPT_URL => $endpoint . '?' . $query,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_ENCODING => "",
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    CURLOPT_CUSTOMREQUEST => "GET"
]);

$body = curl_exec($handle);
$error = curl_error($handle);

curl_close($handle);

// Print the transport error when there is one, otherwise the response body.
echo $error ? "cURL Error #: " . $error : $body;

Golang Microlink API example

package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"
)

// main requests the Microlink API for the sample PDF with embed=html and prints the raw response body.
func main() {
    endpoint, err := url.Parse("https://api.microlink.io")
    if err != nil {
        panic(err)
    }

    // Encode the API parameters into the query string.
    params := url.Values{}
    params.Set("url", "https://cdn.microlink.io/file-examples/sample.pdf")
    params.Set("data.html.attr", "html")
    params.Set("meta", "false")
    params.Set("embed", "html")
    endpoint.RawQuery = params.Encode()

    req, err := http.NewRequest(http.MethodGet, endpoint.String(), nil)
    if err != nil {
        panic(err)
    }

    resp, err := (&http.Client{}).Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    payload, err := io.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(payload))
}
The response body is HTML and can be stored, sanitized, or transformed by your own pipeline.
The same request as a raw URL:
https://api.microlink.io?url=https://cdn.microlink.io/file-examples/sample.pdf&data.html.attr=html&meta=false&embed=html

Extract just the body

If your consumer wants an embeddable fragment instead of the full HTML document, scope the rule to the body:

The following examples show how to use the Microlink API with CLI, cURL, JavaScript, Python, Ruby, PHP & Golang, targeting 'https://cdn.microlink.io/file-examples/sample.pdf' URL with 'data', 'meta' & 'embed' API parameters:

CLI Microlink API example

# Quote the argument: an unquoted & would make the shell background the command and drop the parameters.
microlink 'https://cdn.microlink.io/file-examples/sample.pdf&data.html.selector=body&data.html.attr=html&embed=html'

cURL Microlink API example

# -G sends the -d pairs as query-string parameters of a GET request.
curl -G "https://api.microlink.io" \
  -d "url=https://cdn.microlink.io/file-examples/sample.pdf" \
  -d "data.html.selector=body" \
  -d "data.html.attr=html" \
  -d "meta=false" \
  -d "embed=html"

JavaScript Microlink API example

import mql from '@microlink/mql'

// The data rule is scoped with selector: "body" and combined with embed: "html".
// NOTE(review): this page states that with embed the response body is HTML,
// not a JSON payload — confirm that mql still exposes it as `data`.
const { data } = await mql('https://cdn.microlink.io/file-examples/sample.pdf', {
  data: {
    html: {
      selector: "body",
      attr: "html"
    }
  },
  meta: false,
  embed: "html"
})

Python Microlink API example

import requests

url = "https://api.microlink.io/"

# Query-string parameters; the rule is scoped to the body and "embed" makes
# the API respond with the HTML itself.
querystring = {
    "url": "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.selector": "body",
    "data.html.attr": "html",
    "meta": "false",
    "embed": "html"
}

response = requests.get(url, params=querystring)

# The body is HTML rather than JSON, so read it as text; response.json() would fail to decode it.
print(response.text)

Ruby Microlink API example

require 'uri'
require 'net/http'

base_url = "https://api.microlink.io/"

# Dotted parameter names are not valid bare hash keys in Ruby,
# so every key is written as a string.
params = {
  "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
  "data.html.selector" => "body",
  "data.html.attr" => "html",
  "meta" => "false",
  "embed" => "html"
}

# Encode the parameters into the query string of the API URL.
uri = URI(base_url)
uri.query = URI.encode_www_form(params)

http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true

request = Net::HTTP::Get.new(uri)
response = http.request(request)

puts response.body

PHP Microlink API example

<?php

// Microlink API endpoint; the parameters below are appended as a query string.
$endpoint = "https://api.microlink.io/";

$query = http_build_query([
    "url" => "https://cdn.microlink.io/file-examples/sample.pdf",
    "data.html.selector" => "body",
    "data.html.attr" => "html",
    "meta" => "false",
    "embed" => "html"
]);

$handle = curl_init();

curl_setopt_array($handle, [
    CURLOPT_URL => $endpoint . '?' . $query,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_ENCODING => "",
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    CURLOPT_CUSTOMREQUEST => "GET"
]);

$body = curl_exec($handle);
$error = curl_error($handle);

curl_close($handle);

// Print the transport error when there is one, otherwise the response body.
echo $error ? "cURL Error #: " . $error : $body;

Golang Microlink API example

package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"
)

// main requests the body-scoped HTML of the sample PDF from the Microlink API and prints the raw response body.
func main() {
    endpoint, err := url.Parse("https://api.microlink.io")
    if err != nil {
        panic(err)
    }

    // Encode the API parameters into the query string.
    params := url.Values{}
    params.Set("url", "https://cdn.microlink.io/file-examples/sample.pdf")
    params.Set("data.html.selector", "body")
    params.Set("data.html.attr", "html")
    params.Set("meta", "false")
    params.Set("embed", "html")
    endpoint.RawQuery = params.Encode()

    req, err := http.NewRequest(http.MethodGet, endpoint.String(), nil)
    if err != nil {
        panic(err)
    }

    resp, err := (&http.Client{}).Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    payload, err := io.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(payload))
}
Use the full document for archival or downstream parsing, and a body fragment for insertion into another page.

Know the PDF limit

This works best for PDFs with selectable text. Image-only scans may produce little useful HTML because there is no meaningful text layer to convert.

Next step

Use "Convert a PDF URL to Markdown" when the consumer is an LLM, search index, or Markdown-native system. Use "Convert any URL to HTML" for web pages.