Getting started with Vision OCR
This section describes how to recognize text in an image or file using the Vision OCR API.
Getting started
To use the examples, install cURL
- On the Yandex Cloud Billing
page, make sure you have a billing account that is either ACTIVE or TRIAL_ACTIVE. If you do not have a billing account yet, create one. - Get an IAM token, which is required for authentication.
- Get the ID of any folder for which your account has the
ai.vision.user role or higher. - Specify the ID in the
x-folder-id header.
Recognize text
You can use any recognition model from this list. As an example, we will use the page model which can recognize any amount of text in an image:
-
Prepare an image file that meets the requirements:
- The supported file formats are JPEG, PNG, and PDF. Specify the MIME type
of the file in the mime_type property. The default value is image. - The maximum file size is 10 MB.
- The image size should not exceed 20 MP (height × width).
Tip
Need an example? Download an image
of the penguin crossing road sign.
-
Encode the image file as Base64:
UNIXWindowsPowerShellPythonNode.jsJavaGobase64 -i input.jpg > output.txtC:> Base64.exe -e input.jpg > output.txt[Convert]::ToBase64String([IO.File]::ReadAllBytes("./input.jpg")) > output.txt# Import a library for encoding files in Base64. import base64 # Create a function to encode a file and return the results. def encode_file(file_path): with open(file_path, "rb") as fid: file_content = fid.read() return base64.b64encode(file_content).decode("utf-8")// Read the file contents to memory. var fs = require('fs'); var file = fs.readFileSync('/path/to/file'); // Get the file contents in Base64 format. var encoded = Buffer.from(file).toString('base64');// Import a library for encoding files in Base64. import org.apache.commons.codec.binary.Base64; // Get the file contents in Base64 format. byte[] fileData = Base64.encodeBase64(yourFile.getBytes());import ( "bufio" "encoding/base64" "io/ioutil" "os" ) // Open the file. f, _ := os.Open("/path/to/file") // Read the file contents. reader := bufio.NewReader(f) content, _ := ioutil.ReadAll(reader) // Get the file contents in Base64 format. base64.StdEncoding.EncodeToString(content) -
Create a file with the request body, e.g.,
body.json:
{ "mimeType": "JPEG", "languageCodes": ["*"], "model": "page", "content": "<base64_encoded_image>" }In the
content property, specify the image file contents encoded as Base64. To automatically detect the text language, specify the
"languageCodes": ["*"] property in the configuration. -
Send a request using the recognize method and save the response to a file, e.g.,
output.json:UNIXPythonexport IAM_TOKEN=<IAM_token> curl \ --request POST \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${IAM_TOKEN}" \ --header "x-folder-id: <folder_ID>" \ --header "x-data-logging-enabled: true" \ --data '{ "mimeType": "JPEG", "languageCodes": ["*"], "model": "page", "content": "<base64_encoded_image>" }' \ https://ocr.api.cloud.yandex.net/ocr/v1/recognizeText \ --output output.jsonWhere:
<IAM_token>: Previously obtained IAM token. <folder_ID>: Previously obtained folder ID.
data = {"mimeType": <mime_type>, "languageCodes": ["ru","en"], "content": content} url = "https://ocr.api.cloud.yandex.net/ocr/v1/recognizeText" headers= {"Content-Type": "application/json", "Authorization": "Bearer {:s}".format(<IAM_token>), "x-folder-id": "<folder_ID>", "x-data-logging-enabled": "true"} w = requests.post(url=url, headers=headers, data=json.dumps(data))The result will consist of recognized blocks of text, lines, and words with their position on the image:
{ "result": { "textAnnotation": { "width": "1920", "height": "1280", "blocks": [ { "boundingBox": { "vertices": [ { "x": "460", "y": "777" }, { "x": "460", "y": "906" }, { "x": "810", "y": "906" }, { "x": "810", "y": "777" } ] }, "lines": [ { "boundingBox": { "vertices": [ { "x": "460", "y": "777" }, { "x": "460", "y": "820" }, { "x": "802", "y": "820" }, { "x": "802", "y": "777" } ] }, "text": "PENGUINS", "words": [ { "boundingBox": { "vertices": [ { "x": "460", "y": "768" }, { "x": "460", "y": "830" }, { "x": "802", "y": "830" }, { "x": "802", "y": "768" } ] }, "text": "PENGUINS", "entityIndex": "-1", "textSegments": [ { "startIndex": "0", "length": "8" } ] } ], "textSegments": [ { "startIndex": "0", "length": "8" } ], "orientation": "ANGLE_0" }, { "boundingBox": { "vertices": [ { "x": "489", "y": "861" }, { "x": "489", "y": "906" }, { "x": "810", "y": "906" }, { "x": "810", "y": "861" } ] }, "text": "CROSSING", "words": [ { "boundingBox": { "vertices": [ { "x": "489", "y": "852" }, { "x": "489", "y": "916" }, { "x": "810", "y": "916" }, { "x": "810", "y": "852" } ] }, "text": "CROSSING", "entityIndex": "-1", "textSegments": [ { "startIndex": "9", "length": "8" } ] } ], "textSegments": [ { "startIndex": "9", "length": "8" } ], "orientation": "ANGLE_0" } ], "languages": [ { "languageCode": "en" } ], "textSegments": [ { "startIndex": "0", "length": "17" } ], "layoutType": "LAYOUT_TYPE_TEXT" }, { "boundingBox": { "vertices": [ { "x": "547", "y": "989" }, { "x": "547", "y": "1046" }, { "x": "748", "y": "1046" }, { "x": "748", "y": "989" } ] }, "lines": [ { "boundingBox": { "vertices": [ { "x": "547", "y": "989" }, { "x": "547", "y": "1046" }, { "x": "748", "y": "1046" }, { "x": "748", "y": "989" } ] }, "text": "SLOW", "words": [ { "boundingBox": { "vertices": [ { "x": "547", "y": "983" }, { "x": "547", "y": "1054" }, { "x": "748", "y": "1054" }, { "x": "748", "y": "983" } ] }, "text": "SLOW", "entityIndex": "-1", "textSegments": [ { "startIndex": "18", "length": "4" 
} ] } ], "textSegments": [ { "startIndex": "18", "length": "4" } ], "orientation": "ANGLE_0" } ], "languages": [ { "languageCode": "en" } ], "textSegments": [ { "startIndex": "18", "length": "4" } ], "layoutType": "LAYOUT_TYPE_TEXT" } ], "entities": [], "tables": [], "fullText": "PENGUINS\nCROSSING\nSLOW\n", "rotate": "ANGLE_0", "markdown": " ", "pictures": [] }, "page": "0" } } -
To get all the words recognized in the image, find all values with the
textproperty.
Note
If the coordinates you got do not match the position of the displayed elements, set up support for EXIF metadata in your image viewing tool, or remove the Orientation attribute from the EXIF section of the image before sending it to the service.