Process a PDF
Submit a PDF (or EPUB, DOCX, PPTX, and other document formats) for OCR processing. Results are available as Mathpix Markdown, DOCX, LaTeX, HTML, and more.
PDF processing is asynchronous. You submit the document, then poll for status and download results when complete. For real-time partial results, use the streaming option.
Submit via URL
- mpxpy
- cURL
- Python
- TypeScript
- Go
- Java
from mpxpy.mathpix_client import MathpixClient
client = MathpixClient(app_id="APP_ID", app_key="APP_KEY")
pdf = client.pdf_new(
url="https://cdn.mathpix.com/examples/cs229-notes1.pdf",
convert_to_docx=True,
convert_to_tex_zip=True,
)
print(pdf.pdf_id)
curl -X POST https://api.mathpix.com/v3/pdf \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' \
-H 'Content-Type: application/json' \
--data '{"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf", "conversion_formats": {"docx": true, "tex.zip": true}}'
import requests
r = requests.post("https://api.mathpix.com/v3/pdf",
json={
"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
"conversion_formats": {"docx": True, "tex.zip": True}
},
headers={
"app_id": "APP_ID",
"app_key": "APP_KEY",
"Content-type": "application/json"
}
)
print(r.json()) # {"pdf_id": "..."}
// Request options: the document to fetch plus the extra output formats to generate.
const payload = {
  url: "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
  conversion_formats: { docx: true, "tex.zip": true },
};

// Credentials go in the app_id / app_key headers; the body is sent as JSON.
const requestHeaders = {
  app_id: "APP_ID",
  app_key: "APP_KEY",
  "Content-Type": "application/json",
};

const response = await fetch("https://api.mathpix.com/v3/pdf", {
  method: "POST",
  headers: requestHeaders,
  body: JSON.stringify(payload),
});

// The response JSON carries the pdf_id used by the status and download endpoints.
const submitted = await response.json();
const pdf_id = submitted.pdf_id;
console.log(`PDF ID: ${pdf_id}`);
body := bytes.NewBufferString(`{
"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
"conversion_formats": {"docx": true, "tex.zip": true}
}`)
req, _ := http.NewRequest("POST", "https://api.mathpix.com/v3/pdf", body)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
req.Header.Set("Content-Type", "application/json")
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
result, _ := io.ReadAll(resp.Body)
fmt.Println(string(result)) // {"pdf_id": "..."}
HttpClient client = HttpClient.newHttpClient();
String body = """
{
"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
"conversion_formats": { "docx": true, "tex.zip": true }
}
""";
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf"))
.header("app_id", "APP_ID")
.header("app_key", "APP_KEY")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(body))
.build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
Submit via file upload
- mpxpy
- cURL
- Python
- TypeScript
- Go
- Java
from mpxpy.mathpix_client import MathpixClient
client = MathpixClient(app_id="APP_ID", app_key="APP_KEY")
pdf = client.pdf_new(
file_path="document.pdf",
convert_to_docx=True,
convert_to_tex_zip=True,
)
print(pdf.pdf_id)
curl -X POST https://api.mathpix.com/v3/pdf \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' \
--form 'file=@"document.pdf"' \
--form 'options_json="{\"conversion_formats\": {\"docx\": true, \"tex.zip\": true}}"'
import json

import requests

# Open the document in a context manager so the file handle is closed as soon
# as the upload finishes (or fails) instead of leaking until garbage collection.
with open("document.pdf", "rb") as pdf_file:
    r = requests.post(
        "https://api.mathpix.com/v3/pdf",
        files={"file": pdf_file},
        # In a multipart upload the request options are sent as a JSON string
        # in the "options_json" form field.
        data={
            "options_json": json.dumps({
                "conversion_formats": {"docx": True, "tex.zip": True}
            })
        },
        headers={
            "app_id": "APP_ID",
            "app_key": "APP_KEY"
        }
    )
print(r.json())  # {"pdf_id": "..."}
import fs from "fs";

const form = new FormData();
// Pass the filename explicitly: a bare Blob is uploaded with the default
// filename "blob", which drops the ".pdf" extension from the multipart part.
form.append("file", new Blob([fs.readFileSync("document.pdf")]), "document.pdf");
// Request options are sent as a JSON string in the "options_json" field.
form.append("options_json", JSON.stringify({
  conversion_formats: { docx: true, "tex.zip": true },
}));

// Content-Type is intentionally not set here: fetch generates the
// multipart/form-data header (including the boundary) from the FormData body.
const response = await fetch("https://api.mathpix.com/v3/pdf", {
  method: "POST",
  headers: { app_id: "APP_ID", app_key: "APP_KEY" },
  body: form,
});
const { pdf_id } = await response.json();
console.log(`PDF ID: ${pdf_id}`);
var buf bytes.Buffer
w := multipart.NewWriter(&buf)
fw, _ := w.CreateFormFile("file", "document.pdf")
f, _ := os.Open("document.pdf")
io.Copy(fw, f)
f.Close()
w.WriteField("options_json", `{"conversion_formats":{"docx":true,"tex.zip":true}}`)
w.Close()
req, _ := http.NewRequest("POST", "https://api.mathpix.com/v3/pdf", &buf)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
req.Header.Set("Content-Type", w.FormDataContentType())
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
result, _ := io.ReadAll(resp.Body)
fmt.Println(string(result))
HttpClient client = HttpClient.newHttpClient();
Path file = Path.of("document.pdf");
String boundary = "----FormBoundary" + System.currentTimeMillis();
String optionsJson = "{\"conversion_formats\": {\"docx\": true, \"tex.zip\": true}}";
byte[] fileBytes = Files.readAllBytes(file);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
baos.write(("--" + boundary + "\r\nContent-Disposition: form-data; name=\"file\"; filename=\""
+ file.getFileName() + "\"\r\nContent-Type: application/octet-stream\r\n\r\n").getBytes());
baos.write(fileBytes);
baos.write(("\r\n--" + boundary + "\r\nContent-Disposition: form-data; name=\"options_json\"\r\n\r\n"
+ optionsJson + "\r\n--" + boundary + "--\r\n").getBytes());
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf"))
.header("app_id", "APP_ID")
.header("app_key", "APP_KEY")
.header("Content-Type", "multipart/form-data; boundary=" + boundary)
.POST(HttpRequest.BodyPublishers.ofByteArray(baos.toByteArray()))
.build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
Poll processing status
After submitting, poll until status is "completed" (or "error", if processing failed):
- mpxpy
- cURL
- Python
- TypeScript
- Go
- Java
# wait_until_complete handles polling automatically
pdf.wait_until_complete(timeout=60)
print(pdf.pdf_status())
curl https://api.mathpix.com/v3/pdf/PDF_ID \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY'
import time

import requests

headers = {"app_id": "APP_ID", "app_key": "APP_KEY"}
pdf_id = "YOUR_PDF_ID"

# Stop polling after ~10 minutes so a stuck job cannot loop forever.
deadline = time.monotonic() + 600

while True:
    r = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers)
    r.raise_for_status()  # surface HTTP-level failures instead of parsing them
    status = r.json()
    if "status" not in status:
        # A payload without "status" is not a progress report; fail with the
        # payload in the message rather than with a bare KeyError.
        raise RuntimeError(f"Unexpected response: {status}")
    print(f"Status: {status['status']}, {status.get('percent_done', 0)}% done")
    if status["status"] in ("completed", "error"):
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(f"PDF {pdf_id} did not finish processing in time")
    time.sleep(5)
const headers = { app_id: "APP_ID", app_key: "APP_KEY" };
const pdfId = "YOUR_PDF_ID";
const pollIntervalMs = 5000;
const maxAttempts = 120; // 120 polls x 5 s = give up after ~10 minutes

let finished = false;
for (let attempt = 0; attempt < maxAttempts; attempt++) {
  const response = await fetch(`https://api.mathpix.com/v3/pdf/${pdfId}`, { headers });
  if (!response.ok) {
    // Fail loudly on HTTP errors instead of polling an error payload forever.
    throw new Error(`Status request failed: HTTP ${response.status}`);
  }
  const status = await response.json();
  console.log(`Status: ${status.status}, ${status.percent_done ?? 0}% done`);
  if (status.status === "completed" || status.status === "error") {
    finished = true;
    break;
  }
  await new Promise((r) => setTimeout(r, pollIntervalMs));
}
if (!finished) {
  throw new Error(`PDF ${pdfId} did not finish processing in time`);
}
pdfId := "YOUR_PDF_ID"
delay := 5 * time.Second
for i := 0; i < 120; i++ { // timeout after ~10 minutes
req, _ := http.NewRequest("GET", "https://api.mathpix.com/v3/pdf/"+pdfId, nil)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
resp, _ := http.DefaultClient.Do(req)
result, _ := io.ReadAll(resp.Body)
resp.Body.Close()
fmt.Println(string(result))
if bytes.Contains(result, []byte(`"completed"`)) || bytes.Contains(result, []byte(`"error"`)) {
break
}
time.Sleep(delay)
if delay < 30*time.Second {
delay = delay * 3 / 2 // backoff up to 30s
}
}
HttpClient client = HttpClient.newHttpClient();
String pdfId = "YOUR_PDF_ID";
while (true) {
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf/" + pdfId))
.header("app_id", "APP_ID")
.header("app_key", "APP_KEY")
.GET().build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
if (response.body().contains("\"completed\"") || response.body().contains("\"error\"")) break;
Thread.sleep(5000);
}
Response while processing:
{
"status": "split",
"num_pages": 12,
"num_pages_completed": 4,
"percent_done": 33.33
}
Response when complete:
{
"status": "completed",
"num_pages": 12,
"num_pages_completed": 12,
"percent_done": 100
}
Download results
Once processing is complete, download results by appending the format extension to the PDF ID in the request URL (for example, /v3/pdf/PDF_ID.mmd):
- mpxpy
- cURL
- Python
- TypeScript
- Go
- Java
# Save to files
pdf.to_md_file(path="result.mmd")
pdf.to_docx_file(path="result.docx")
pdf.to_tex_zip_file(path="result.tex.zip")
pdf.to_lines_json_file(path="lines.json")
# Or get content in memory
md_text = pdf.to_md_text() # str
docx_bytes = pdf.to_docx_bytes() # bytes
lines = pdf.to_lines_json() # dict
# Mathpix Markdown
curl https://api.mathpix.com/v3/pdf/PDF_ID.mmd \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' > result.mmd
# DOCX (requires conversion_formats: {"docx": true})
curl https://api.mathpix.com/v3/pdf/PDF_ID.docx \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' > result.docx
# LaTeX zip
curl https://api.mathpix.com/v3/pdf/PDF_ID.tex.zip \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' > result.tex.zip
# Line-by-line JSON data
curl https://api.mathpix.com/v3/pdf/PDF_ID.lines.json \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' > lines.json
import requests

headers = {"app_id": "APP_ID", "app_key": "APP_KEY"}
pdf_id = "YOUR_PDF_ID"

# Mathpix Markdown
# Write the raw response bytes: this preserves the server's encoding exactly,
# whereas text mode re-encodes r.text with the platform's default encoding and
# can fail or corrupt non-ASCII characters.
r = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}.mmd", headers=headers)
with open("result.mmd", "wb") as f:
    f.write(r.content)

# DOCX (requires conversion_formats: {"docx": true})
r = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}.docx", headers=headers)
with open("result.docx", "wb") as f:
    f.write(r.content)

# LaTeX zip
r = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex.zip", headers=headers)
with open("result.tex.zip", "wb") as f:
    f.write(r.content)
import fs from "fs";

const headers = { app_id: "APP_ID", app_key: "APP_KEY" };
const pdfId = "YOUR_PDF_ID";

// Mathpix Markdown
const mmd = await fetch(`https://api.mathpix.com/v3/pdf/${pdfId}.mmd`, { headers });
if (!mmd.ok) {
  // Without this check an error response body would be saved as result.mmd.
  throw new Error(`MMD download failed: HTTP ${mmd.status}`);
}
fs.writeFileSync("result.mmd", await mmd.text());

// DOCX (binary, so write the raw bytes rather than decoded text)
const docx = await fetch(`https://api.mathpix.com/v3/pdf/${pdfId}.docx`, { headers });
if (!docx.ok) {
  throw new Error(`DOCX download failed: HTTP ${docx.status}`);
}
fs.writeFileSync("result.docx", Buffer.from(await docx.arrayBuffer()));
pdfId := "YOUR_PDF_ID"
for _, ext := range []string{"mmd", "docx", "tex.zip"} {
req, _ := http.NewRequest("GET", "https://api.mathpix.com/v3/pdf/"+pdfId+"."+ext, nil)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
resp, _ := http.DefaultClient.Do(req)
data, _ := io.ReadAll(resp.Body)
resp.Body.Close()
os.WriteFile("result."+ext, data, 0644)
}
HttpClient client = HttpClient.newHttpClient();
String pdfId = "YOUR_PDF_ID";
// Mathpix Markdown
HttpRequest mmdReq = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf/" + pdfId + ".mmd"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY").GET().build();
HttpResponse<String> mmdResp = client.send(mmdReq, HttpResponse.BodyHandlers.ofString());
Files.writeString(Path.of("result.mmd"), mmdResp.body());
// DOCX
HttpRequest docxReq = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf/" + pdfId + ".docx"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY").GET().build();
HttpResponse<byte[]> docxResp = client.send(docxReq, HttpResponse.BodyHandlers.ofByteArray());
Files.write(Path.of("result.docx"), docxResp.body());
Check conversion status
If you requested conversion_formats, check their status separately:
- mpxpy
- cURL
- Python
- TypeScript
- Go
- Java
print(pdf.pdf_conversion_status())
curl https://api.mathpix.com/v3/converter/PDF_ID \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY'
import requests

headers = {"app_id": "APP_ID", "app_key": "APP_KEY"}
# Define the ID here so the snippet runs on its own.
pdf_id = "YOUR_PDF_ID"

# The converter endpoint reports the status of each requested conversion format.
r = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", headers=headers)
print(r.json())
// Credentials are sent as app_id / app_key request headers.
const headers = { app_id: "APP_ID", app_key: "APP_KEY" };

// Ask the converter endpoint for the status of the requested conversion formats.
const response = await fetch("https://api.mathpix.com/v3/converter/PDF_ID", { headers });
const conversionStatus = await response.json();
console.log(conversionStatus);
req, _ := http.NewRequest("GET", "https://api.mathpix.com/v3/converter/PDF_ID", nil)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
result, _ := io.ReadAll(resp.Body)
fmt.Println(string(result))
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/converter/PDF_ID"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY").GET().build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
Response:
{
"status": "completed",
"conversion_status": {
"docx": { "status": "completed" },
"tex.zip": { "status": "completed" }
}
}
Stream pages
For lower time-to-first-data, enable streaming to receive page results via server-sent events (SSE) as each page completes:
- cURL
- Python
- TypeScript
- Go
- Java
# 1. Submit with streaming enabled
curl -X POST https://api.mathpix.com/v3/pdf \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY' \
-H 'Content-Type: application/json' \
--data '{"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf", "streaming": true}'
# 2. Connect to the SSE stream
curl https://api.mathpix.com/v3/pdf/PDF_ID/stream \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY'
import requests, json
headers = {"app_id": "APP_ID", "app_key": "APP_KEY", "Content-type": "application/json"}
# 1. Submit with streaming enabled
r = requests.post("https://api.mathpix.com/v3/pdf",
json={"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf", "streaming": True},
headers=headers
)
pdf_id = r.json()["pdf_id"]
# 2. Stream results via SSE
r = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}/stream",
headers=headers, stream=True)
for line in r.iter_lines():
if line:
page = json.loads(line)
print(f"Page {page['page_idx']}/{page['pdf_selected_len']}: {page['text'][:100]}...")
const headers = {
  app_id: "APP_ID",
  app_key: "APP_KEY",
  "Content-Type": "application/json",
};

// 1. Submit with streaming enabled
const submit = await fetch("https://api.mathpix.com/v3/pdf", {
  method: "POST",
  headers,
  body: JSON.stringify({
    url: "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
    streaming: true,
  }),
});
const { pdf_id } = await submit.json();

// 2. Stream results via SSE
const stream = await fetch(`https://api.mathpix.com/v3/pdf/${pdf_id}/stream`, { headers });
if (!stream.body) {
  // A runtime guard instead of a non-null assertion: fail with a clear message.
  throw new Error("Streaming response has no body");
}
const reader = stream.body.getReader();
const decoder = new TextDecoder();
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  // Network chunks can end in the middle of a multi-byte UTF-8 character;
  // { stream: true } buffers the partial bytes until the next chunk arrives
  // instead of emitting replacement characters.
  console.log(decoder.decode(value, { stream: true }));
}
// Flush any bytes the decoder is still holding once the stream has ended.
const tail = decoder.decode();
if (tail) console.log(tail);
// 1. Submit with streaming enabled
body := bytes.NewBufferString(`{
"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
"streaming": true
}`)
req, _ := http.NewRequest("POST", "https://api.mathpix.com/v3/pdf", body)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
req.Header.Set("Content-Type", "application/json")
resp, _ := http.DefaultClient.Do(req)
submitBody, _ := io.ReadAll(resp.Body)
resp.Body.Close()
// 2. Stream results via SSE (parse pdf_id from response)
pdfId := "YOUR_PDF_ID"
req, _ = http.NewRequest("GET", "https://api.mathpix.com/v3/pdf/"+pdfId+"/stream", nil)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
resp, _ = http.DefaultClient.Do(req)
defer resp.Body.Close()
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
fmt.Println(scanner.Text())
}
HttpClient client = HttpClient.newHttpClient();
String body = """
{"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf", "streaming": true}
""";
// 1. Submit with streaming enabled
HttpRequest submit = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY")
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(body))
.build();
HttpResponse<String> resp = client.send(submit, HttpResponse.BodyHandlers.ofString());
// Extract pdf_id while tolerating whitespace around the colon. find() is used
// instead of a whole-string replaceAll, which returns the entire body
// unchanged when the pattern does not match.
java.util.regex.Matcher idMatch = java.util.regex.Pattern
.compile("\"pdf_id\"\\s*:\\s*\"([^\"]+)\"")
.matcher(resp.body());
if (!idMatch.find()) {
throw new IllegalStateException("No pdf_id in response: " + resp.body());
}
String pdfId = idMatch.group(1);
// 2. Stream results via SSE
HttpRequest stream = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf/" + pdfId + "/stream"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY").GET().build();
// ofLines() exposes the body as a lazily-read stream of lines, so each line
// is printed as it arrives.
HttpResponse<java.util.stream.Stream<String>> sse = client.send(stream,
HttpResponse.BodyHandlers.ofLines());
sse.body().forEach(System.out::println);
Pages are streamed one JSON object at a time. Pages are not guaranteed to be in order, although they generally will be.
Process specific pages
Use page_ranges to process only certain pages:
{
"url": "https://cdn.mathpix.com/examples/cs229-notes1.pdf",
"page_ranges": "2,4-6"
}
This selects pages [2, 4, 5, 6]. You can also use negative indices: "2 - -2" selects all pages from the second to the next-to-last.
Delete results
Permanently delete a PDF's output data when you no longer need it:
- cURL
- Python
- TypeScript
- Go
- Java
curl -X DELETE https://api.mathpix.com/v3/pdf/PDF_ID \
-H 'app_id: APP_ID' \
-H 'app_key: APP_KEY'
import requests

# Define the ID here so the snippet runs on its own.
pdf_id = "YOUR_PDF_ID"

# Send a DELETE request for this PDF and print the HTTP status code.
r = requests.delete(f"https://api.mathpix.com/v3/pdf/{pdf_id}",
                    headers={"app_id": "APP_ID", "app_key": "APP_KEY"})
print(r.status_code)
// Credentials are sent as app_id / app_key request headers.
const headers = { app_id: "APP_ID", app_key: "APP_KEY" };

// Send a DELETE request for this PDF and log the HTTP status code.
const response = await fetch("https://api.mathpix.com/v3/pdf/PDF_ID", {
  method: "DELETE",
  headers,
});
const statusCode = response.status;
console.log(statusCode);
req, _ := http.NewRequest("DELETE", "https://api.mathpix.com/v3/pdf/PDF_ID", nil)
req.Header.Set("app_id", "APP_ID")
req.Header.Set("app_key", "APP_KEY")
resp, _ := http.DefaultClient.Do(req)
defer resp.Body.Close()
fmt.Println(resp.StatusCode)
HttpClient client = HttpClient.newHttpClient();
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create("https://api.mathpix.com/v3/pdf/PDF_ID"))
.header("app_id", "APP_ID").header("app_key", "APP_KEY")
.DELETE().build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.statusCode());
Download and store files locally before deleting if you need to keep them. Deletion is permanent.
Supported formats
Input: PDF, EPUB, DOCX, PPTX, AZW/AZW3/KFX, MOBI, DJVU, DOC, WPD, ODT
Output: MMD, MD, DOCX, LaTeX zip, HTML, PDF (with HTML or LaTeX rendering), PPTX, and ZIP variants with images
Next steps
- POST v3/pdf reference — Full request parameters, response schema, streaming, status polling, and lines data
- POST v3/converter reference — Convert MMD text to other formats directly
- Authentication — How to get your API keys